This module builds on code contained in Coronavirus_Statistics_USAF_v004.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.
The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.
The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
source("./Coronavirus_USAF_Functions_v001.R")
Further, the mapping file specific to USA Facts is sourced:
source("./Coronavirus_USAF_Default_Mappings_v001.R")
A few functions should be added back to Generic_Added_…v001.R and Coronavirus_CDC_Daily…_v001.R after they have been more thoroughly checked for compatibility with the state-based clustering. For now, they are included below so as to over-write the function obtained from source(…):
# Updated function for handling county-level clusters
createSummary <- function(df,
stateClusterDF=NULL,
brewPalette=NA,
dataType="state"
) {
# FUNCTION ARGUMENTS:
# df: an integrated data frame by cluster-date
# stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
# brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
# dataType: the type of maps being produced ("state" or "county")
# Create plots that can be relevant for a dashboard, including:
# 1. Map of segments
# 2. Bar plot of counts by segment
# 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
# 4. Facetted trend-line plot of burden by segments
# Create a map of the clusters
p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
mapLevel=if(dataType=="state") "states" else "counties",
keyCol=if(dataType=="state") "state" else "countyFIPS",
discreteValues=TRUE,
labelScale=is.na(brewPalette),
textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create a bar plot of counts by segment
p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
mapLevel=if(dataType=="state") "states" else "counties",
keyCol=if(dataType=="state") "state" else "countyFIPS",
discreteValues=TRUE,
labelScale=is.na(brewPalette),
countOnly=TRUE,
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for population and burden by cluster
p3 <- df %>%
helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
mapper=c("pop"="Population (millions)",
"wm_tcpm7"="Cases per thousand",
"wm_tdpm7"="Deaths per million"
),
xLab=NULL,
yLab=NULL,
title=NULL,
divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for cumulative burden per million over time
p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
p4 <- df %>%
helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
"wm_tdpm7"="Deaths per million\n(cumulative)",
"wm_hpm7"="Hospitalized per million\n(current)"
),
yLab=NULL,
title=NULL,
divideBy=c("wm_tcpm7"=1000),
linesize=0.75,
extraArgs=p4xtra
)
list(p1=p1, p2=p2, p3=p3, p4=p4)
}
# Helper function to make a summary map
helperSummaryMap <- function(df,
mapLevel="states",
keyCol="state",
values="cluster",
discreteValues=NULL,
legend.position="right",
labelScale=TRUE,
extraArgs=list(),
countOnly=FALSE,
textLabel=c(),
...
) {
# FUNCTION ARGUMENTS:
# df: a data frame containing a level of geography and an associated cluster
# mapLevel: a parameter for whether the map is "states" or "counties"
# keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
# values: the character name of the field containing the data to be plotted
# discreteValues: boolean for whether the values are discrete (if not, use continuous)
# NULL means infer from data
# legend.position: character for the location of the legend in the plot
# labelScale: boolean, should an scale_fill_ be created? Use FALSE if contained in extraArgs
# extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
# countOnly: should a bar plot of counts only be produced?
# textLabel: a list of elements that should be labelled as text on the plot (too small to see)
# ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
# Modify the data frame to contain only the relevant data
df <- df %>%
select(all_of(c(keyCol, values))) %>%
distinct()
# Determine the type of data being plotted
if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
# Convert data type if needed
if (isTRUE(discreteValues) & is.numeric(df[[values]]))
df[[values]] <- factor(df[[values]])
# If count only is needed, create a count map; otherwise create a map
if (isTRUE(countOnly)) {
gg <- df %>%
ggplot(aes(x=fct_rev(get(values)))) +
geom_bar(aes_string(fill=values)) +
stat_count(aes(label=..count.., y=..count../2),
geom="text",
position="identity",
fontface="bold"
) +
coord_flip() +
labs(y="Number of members", x="")
} else {
if(keyCol=="countyFIPS") {
df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
keyCol <- "fips"
}
gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
if (length(textLabel) > 0) {
labDF <- df %>%
filter(get(keyCol) %in% textLabel) %>%
mutate(rk=match(get(keyCol), textLabel)) %>%
arrange(rk) %>%
mutate(lon=-70.1-seq(0, 0.8*length(textLabel)-0.8, by=0.8),
lat=40.1-seq(0, 1.5*length(textLabel)-1.5, by=1.5)
) %>%
select(lon, lat, everything()) %>%
usmap::usmap_transform()
gg <- gg + geom_text(data=labDF,
aes(x=lon.1, y=lat.1, label=paste(get(keyCol), get(values))),
size=3.25
)
}
}
# Position the legend as requested
gg <- gg + theme(legend.position=legend.position)
# Create the scale if appropriate
if (isTRUE(labelScale)) gg <- gg +
if(isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
# Apply extra arguments
for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
# Return the map object
gg
}
# Function to pivot the data file longer
pivotData <- function(df,
pivotKeys,
nameVar="name",
valVar="value",
toLonger=TRUE,
...
) {
# FUNCTION ARGUMENTS:
# df: the data frame
# pivotKeys: the keys (everything but cols for pivot_longer, id_cols for pivot_wider)
# nameVar: variable name for names_to or names_from
# valVar: variable name for values_to or values_from
# toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
# ...: other arguments to be passed to pivot_*()
if (isTRUE(toLonger)) pivot_longer(df, -all_of(pivotKeys), names_to=nameVar, values_to=valVar, ...)
else pivot_wider(df, all_of(pivotKeys), names_from=all_of(nameVar), values_from=all_of(valVar), ...)
}
An existing processed USA Facts list is loaded for use as the comparison set:
cty_newformat_20201026 <- readFromRDS("cty_newformat_20201026")
A function is added to create a county-level cluster map with state borders:
sparseCountyClusterMap <- function(vec,
clustRemap=c("999"=NA),
caption=NULL,
brewPalette=NULL,
naFill="white"
) {
# FUNCTION ARGUMENTS:
# vec: a named vector where the names are countyFIPS and the values are the clusters
# can also be a data.frame, in which case clustersToFrame() and clustRemap are not used
# clustRemap: remapping vector for clusters
# caption: caption to be included (NULL means no caption)
# brewPalette: name of a palette for scale_fill_brewer()
# NULL means use scale_fill_discrete() instead
# naFill: fill to use for NA counties
# Convert to a tibble for use with usmap if not already a data.frame
if ("data.frame" %in% class(vec)) {
df <- vec
} else {
df <- clustersToFrame(vec, colNameName="fips", convFactor=FALSE) %>%
mutate(cluster=as.character(cluster),
cluster=ifelse(cluster %in% names(clustRemap), clustRemap[cluster], cluster)
)
}
# Create a blank state map with black lines
blankStates <- usmap::plot_usmap("states")
# Create a county cluster map with NA values excluded
dataCounties <- df %>%
filter(!is.na(cluster)) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster")
# Integrate as a ggplot object
p1 <- ggplot() +
geom_polygon(data=dataCounties[[1]],
aes(x=x, y=y, group=group, fill=dataCounties[[1]]$cluster),
color = NA,
size = 0.1
) +
geom_polygon(data=blankStates[[1]],
aes(x=x, y=y, group=group),
color = "black",
lwd=1.25,
fill = alpha(0.001)
) +
coord_equal() +
ggthemes::theme_map()
# Add the appropriate fill (can use viridis_d also)
if (is.null(brewPalette)) p1 <- p1 + scale_fill_discrete("Cluster", na.value=naFill)
else if (brewPalette=="viridis") p1 <- p1 + scale_fill_viridis_d("Cluster", na.value=naFill)
else p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette, na.value=naFill)
# Add caption if requested
if (!is.null(caption)) p1 <- p1 + labs(caption=caption)
# Return the plotting object
p1
}
The function is then run using previously created segments:
sparseCountyClusterMap(cty_newformat_20201026$useClusters,
brewPalette="Paired",
caption="Counties with population under 25k are blank"
)
Next steps are to load new data and compare against previous, while using existing segments:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
)
compareList <- list("usafCase"=cty_newformat_20201026$dfRaw$usafCase,
"usafDeath"=cty_newformat_20201026$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log",
ovrwriteLog=TRUE,
useClusters=cty_newformat_20201026$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 551 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 29 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 640 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.20e+9 3.28e+7 1602886
## 2 after 6.18e+9 3.27e+7 1577284
## 3 pctchg 4.58e-3 3.84e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.26e+8 590404 1602886
## 2 after 1.25e+8 589015 1577284
## 3 pctchg 3.75e-3 0.00235 0.0160
## NULL
sparseCountyClusterMap(cty_newdata_20210608$useClusters,
brewPalette="Paired",
caption="Counties with population under 25k are blank"
)
There has been significant convergence among segments in average deaths per million and cases per million. This is suggestive of several possibilities, such as that growth in burden may be inversely proportional to previous burden. Next steps are to create segments using the most recent data, seeking to identify differences in 1) cumulative burden, primarily defined by deaths, and 2) shape of the curve in getting to cumulative burden.
New segments are created, with assessments:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
)
# Create new clusters
cty_newsegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
writeLog="./RInputFiles/Coronavirus/USAF_NewSegs_20210608_v005.log",
ovrwriteLog=TRUE,
dfPerCapita=cty_newdata_20210608$dfPerCapita,
useClusters=NULL,
skipAssessmentPlots=FALSE,
brewPalette="Paired",
defaultCluster="999",
minPopCluster=40000,
hierarchical=NA,
minShape="2020-04",
maxShape="2021-05",
ratioDeathvsCase = 5,
ratioTotalvsShape = 0.5,
minDeath=100,
minCase=5000,
hmlSegs=3,
eslSegs=3,
seed=2106091243
)
##
## *** File has been checked for uniqueness by: countyFIPS date
## NULL
sparseCountyClusterMap(cty_newsegs_20210608$useClusters,
brewPalette="Paired",
caption="Counties with population under 40k are blank"
)
The function createSummary() is updated to 1) create the sparse summary map; and 2) exclude the small county segment from select plots:
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst,
lstExtract=fullListExtract,
clusterFrame=NULL,
brewPalette=NA,
clusterType="state",
printSummary=TRUE,
printDetailed=TRUE
) {
# FUNCTION ARGUMENTS:
# lst: a list containing processed clustering data
# lstExtract: the elements to extract from lst with an optional function for converting the elements
# NULL means use the extracted element as-is
# clusterFrame: tibble of the clusters to be plotted
# NULL means create from lst
# brewPalette: the color palette to use with scale_*_brewer()
# default NA means use the standard color/fill profile
# clusterType: character variable of form "state" for state clusters and "county" for county
# printSummary: boolean, should summary plots be printed to the log?
# printDetailed: boolean, should detailed plots be printed to the log?
# Get the key variable (used for joins and the like)
if (clusterType=="state") keyVar <- "state"
else if (clusterType=="county") keyVar <- "countyFIPS"
else stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
# Create clusterFrame from lst if it has been passed as NULL
if (is.null(clusterFrame)) clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
# Create the integrated and aggregate data from lst
dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
dfAgg <- combineAggData(dfFull, aggBy=plotCombineAggByMapper[[clusterType]])
# Create the main summary plots
summaryPlots <- createSummary(dfAgg,
stateClusterDF=clusterFrame,
brewPalette=brewPalette,
dataType=clusterType
)
# Create the detailed summaries
detPlots <- createDetailedSummaries(dfDetail=dfFull,
dfAgg=dfAgg,
detVar=keyVar,
p2DetMetrics=plotCombineAggByMapper[[clusterType]]$agg2$aggVars,
brewPalette=brewPalette
)
# Print the summary plots if requested (use the sparse county plotting)
if (isTRUE(printSummary)) {
gridExtra::grid.arrange(summaryPlots$p5 + theme(legend.position="none"),
summaryPlots$p3 + theme(legend.position="left"),
summaryPlots$p4,
layout_matrix=rbind(c(1, 2),
c(3, 3)
)
)
}
# Print the detailed plots if requested
if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
# Return a list of the key plotting files
list(dfFull=dfFull,
dfAgg=dfAgg,
plotClusters=clusterFrame,
summaryPlots=summaryPlots,
detPlots=detPlots
)
}
# Function to create detailed cluster summaries
createDetailedSummaries <- function(dfDetail,
dfAgg,
aggVar=c("cluster"),
detVar=c("state"),
p1Metrics=c("tcpm", "tdpm"),
p1Order=c("tdpm"),
p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
p2AggMetrics=paste0("wm_", p2DetMetrics),
p3Metrics=p1Metrics,
p3Days=30,
p3Slope=0.25,
mapper=c("tcpm"="Cases per million\n(cumulative)",
"tdpm"="Deaths per million\n(cumulative)",
"cpm7"="Cases\nper million",
"dpm7"="Deaths\nper million",
"hpm7"="Hospitalized\nper million",
"tdpm7"="Deaths (cum)\nper million",
"tcpm7"="Cases (cum)\nper million"
),
brewPalette=NA
) {
# FUNCTION ARGUMENTS:
# dfDetail: data frame or tibble containing detailed (sub-cluster) data
# dfAgg: data frame or tibble containing aggregated (cluster) data
# aggVar: variable reflecting the aggregate level
# detVar: variable reflecting the detailed level
# p1Metrics: metrics to be shown for plot 1 (will be faceted)
# p1Order: variable for ordering detVar in p1
# p2DetMetrics: variables to be included from dfDetail for p2
# p2AggMetrics: corresponding variables to be included from dfAgg for p2
# p3Metrics: metrics to be included in the growth plots
# p3Days: number of days to include for calculating growth
# p3Slope: the slope for the dashed line for growth in p3
# mapper mapping file for variable name to descriptive name
# brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
# Create plot for aggregates by sub-cluster
if(detVar=="state") {
p1 <- dfDetail %>%
colSelector(vecSelect=c(detVar, aggVar, "date", p1Metrics)) %>%
pivot_longer(all_of(p1Metrics)) %>%
filter(!is.na(value)) %>%
group_by_at(detVar) %>%
filter(date==max(date)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(get(detVar),
value,
.fun=function(x) { max(ifelse(name==p1Order, x, 0)) }
),
y=value
)
) +
geom_col(aes(fill=get(aggVar))) +
facet_wrap(~mapper[name], scales="free_x") +
coord_flip() +
labs(title="Cumulative burden", x=NULL, y=NULL)
if (!is.na(brewPalette)) p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette)
} else {
# Do not create the plot for other than state-level data
p1 <- NULL
}
# Create facetted burden trends by aggregate
# For state-level, create each state as a line
# For anything else, create a 10%-90% range
if (detVar=="state") {
p2 <- dfDetail %>%
colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
pivot_longer(all_of(p2DetMetrics)) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes_string(group=detVar), color="grey", size=0.5) +
facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x=NULL, y=NULL, title="Burden by cluster and metric")
} else {
p2 <- dfDetail %>%
colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
pivot_longer(all_of(p2DetMetrics)) %>%
filter(!is.na(value)) %>%
group_by_at(c(aggVar, "name", "date")) %>%
summarize(p10=unname(quantile(value, 0.1)), p90=unname(quantile(value, 0.9)), .groups="drop") %>%
ggplot(aes(x=date)) +
geom_ribbon(aes(ymin=p10, ymax=p90), alpha=0.75) +
facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x=NULL,
y=NULL,
title="Burden by cluster and metric",
subtitle="Shaded region is 10%le through 90%le, solid line is weighted mean"
)
}
aggPlot <- dfAgg %>%
colSelector(vecSelect=c("date", aggVar, p2AggMetrics)) %>%
colRenamer(vecRename=purrr::set_names(p2DetMetrics, p2AggMetrics)) %>%
pivot_longer(all_of(p2DetMetrics)) %>%
filter(!is.na(value))
p2 <- p2 +
geom_line(data=aggPlot, aes_string(color=aggVar, group=aggVar, y="value"), size=1.5)
if (!is.na(brewPalette)) p2 <- p2 +
scale_color_brewer("Cluster", palette=brewPalette) +
theme(legend.position="none")
# Create growth trends plot
if (TRUE) {
if(detVar=="countyFIPS") p3 <- dfDetail %>% filter(cluster != "999") else p3 <- dfDetail
p3 <- p3 %>%
colSelector(vecSelect=c(detVar, aggVar, "date", p3Metrics)) %>%
pivot_longer(all_of(p3Metrics)) %>%
filter(!is.na(value)) %>%
group_by_at(c(detVar, "name")) %>%
filter(date %in% c(max(date), max(date)-lubridate::days(p3Days))) %>%
mutate(growth=max(value)-min(value)) %>% # not ideal way to calculate
filter(date==max(date)) %>%
ungroup() %>%
ggplot(aes(x=value, y=growth))
if(detVar=="state") p3 <- p3 + geom_text(aes_string(color=aggVar, label=detVar), fontface="bold")
else p3 <- p3 + geom_point(aes_string(color=aggVar))
p3 <- p3 +
facet_wrap(~mapper[name], scales="free") +
labs(title=paste0("Current vs growth"),
subtitle=paste0("Dashed line represents ",
round(100*p3Slope),
"% growth rate over past ",
p3Days,
" days"
),
x="Most recent cumulative",
y=paste0("Growth over past ", p3Days, " days")
) +
lims(y=c(0, NA), x=c(0, NA)) +
theme(panel.background = element_rect(fill = "white", colour = "white"),
panel.grid.major = element_line(size = 0.25, linetype = 'solid', color = "grey")
) +
geom_abline(slope=p3Slope, intercept=0, lty=2)
if (!is.na(brewPalette)) {
p3 <- p3 + scale_color_brewer(stringr::str_to_title(aggVar), palette=brewPalette)
}
if(detVar=="countyFIPS") p3 <- p3 + labs(caption="Counties below population threshold excluded")
} else {
p3 <- NULL
}
# Return a list of plot objects
list(p1=p1, p2=p2, p3=p3)
}
# Updated function for handling county-level clusters
createSummary <- function(df,
stateClusterDF=NULL,
brewPalette=NA,
dataType="state"
) {
# FUNCTION ARGUMENTS:
# df: an integrated data frame by cluster-date
# stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
# brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
# dataType: the type of maps being produced ("state" or "county")
# Create plots that can be relevant for a dashboard, including:
# 1. Map of segments
# 2. Bar plot of counts by segment
# 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
# 4. Facetted trend-line plot of burden by segments
# 5. Sparse county cluster map (if dataType is "county")
# Create a map of the clusters
p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
mapLevel=if(dataType=="state") "states" else "counties",
keyCol=if(dataType=="state") "state" else "countyFIPS",
discreteValues=TRUE,
labelScale=is.na(brewPalette),
textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create a bar plot of counts by segment
p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
mapLevel=if(dataType=="state") "states" else "counties",
keyCol=if(dataType=="state") "state" else "countyFIPS",
discreteValues=TRUE,
labelScale=is.na(brewPalette),
countOnly=TRUE,
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for population and burden by cluster
p3 <- df %>%
helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
mapper=c("pop"="Population (millions)",
"wm_tcpm7"="Cases per thousand",
"wm_tdpm7"="Deaths per million"
),
xLab=NULL,
yLab=NULL,
title=NULL,
divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for cumulative burden per million over time
p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
p4 <- df %>%
helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
"wm_tdpm7"="Deaths per million\n(cumulative)",
"wm_hpm7"="Hospitalized per million\n(current)"
),
yLab=NULL,
title=NULL,
divideBy=c("wm_tcpm7"=1000),
linesize=0.75,
extraArgs=p4xtra
)
# Create the sparse county plot (if it is county data) - assumes default that '999' is 'too small'
if (dataType=="county") {
vecFrame <- if(is.null(stateClusterDF)) df else stateClusterDF
vecFrame <- vecFrame %>% select(countyFIPS, cluster) %>%
distinct()
vecUse <- vecFrame$cluster %>% purrr::set_names(vecFrame$countyFIPS)
p5 <- sparseCountyClusterMap(vecUse,
caption="Counties below population threshold left blank",
brewPalette=if(is.na(brewPalette)) NULL else brewPalette
)
} else {
p5 <- NULL
}
list(p1=p1, p2=p2, p3=p3, p4=p4, p5=p5)
}
The updated functions are tested:
# Confirm that new cluster maps are working as intended
cty_chksegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
dfPerCapita=cty_newsegs_20210608$dfPerCapita,
useClusters=cty_newsegs_20210608$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
## NULL
The updated maps blank out the ‘999’ (small population) cluster as desired. The clusters are designed using a 3x3 of total burden (mainly death) and shape of the curve. Shapes of the curve are plotted, normalized so that each curve sums to 100%:
p1Data <- cty_chksegs_20210608$plotDataList$dfAgg %>%
pivotData(pivotKeys=c("cluster", "date", "pop")) %>%
filter(!is.na(value), cluster!="999", name %in% c("wm_cpm7", "wm_dpm7")) %>%
group_by(cluster, name) %>%
mutate(pct=value/sum(value)) %>%
ungroup()
# Plot all together
p1Data %>%
ggplot(aes(x=date, y=pct)) +
geom_line(aes(group=cluster, color=cluster), lwd=1) +
facet_wrap(~c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]) +
scale_color_brewer(palette="Paired") +
labs(x=NULL,
y="Percentage of total burden",
main="Burden over time, normalized to 100% cumulative burden"
)
# Plot as facets
p1Data %>%
ggplot(aes(x=date, y=pct)) +
geom_line(aes(group=cluster, color=cluster), lwd=1) +
facet_grid(c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]~cluster) +
scale_color_brewer(palette="Paired") +
labs(x=NULL,
y="Percentage of total burden",
main="Burden over time, normalized to 100% cumulative burden"
)
Broadly, the shapes selected by the clustering include:
Broadly, there are examples of relatively higher and lower burden within each of these shapes, though the highest burden clusters mainly had two spikes with primary impact early and the medium burden clusters rarely had the primary impact early.
Next, all counties of at least 50,000 population are examined for the shape of the cumulative deaths curve:
p2Data <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>=50000) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
p2Dist <- p2Data %>%
filter(name=="tdpm7") %>%
select(countyFIPS, date, pct) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="pct") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
p2Complete <- hclust(p2Dist, method="complete")
plot(p2Complete)
p2Single <- hclust(p2Dist, method="single")
plot(p2Single)
There is something peculiar about data in ‘32510’. A plot is created:
p2Data %>%
ggplot(aes(x=date, y=pct)) +
geom_line(data=~filter(., countyFIPS=="32510"), aes(group=countyFIPS), color="red", lwd=2) +
geom_line(data=~summarize(group_by(., name, date), pct=mean(pct), .groups="drop")) +
facet_wrap(~c("tcpm7"="Rolling 7 cases", "tdpm7"="Rolling 7 deaths")[name]) +
labs(x=NULL, y="Percentage of total burden on day", title="Cluster 32510 is red, mean is black")
There is clearly a data issue with cluster 32510. Clusters are examined at a variable number of cut points:
plotHierarchical <- function(obj, k, df) {
# FUNCtION ARGuMENTS:
# obj: clustering object from hierarchical clustering
# k: number of clusters to create
# df: data frame with burden data
# Counts by cluster
ctCluster <- cutree(obj, k=k) %>%
table() %>%
print()
# Make clustering data frame
clData <- cutree(obj, k=k) %>%
clustersToFrame(colNameName="countyFIPS") %>%
joinFrames(df) %>%
group_by(countyFIPS, name) %>%
mutate(delta=ifelse(row_number()==1, pct, pct-lag(pct))) %>%
ungroup() %>%
group_by(cluster, date, name) %>%
summarize(p05cum=quantile(pct, 0.05),
mucum=mean(pct),
p95cum=quantile(pct, 0.95),
p05delta=quantile(delta, 0.05),
mudelta=mean(delta),
p95delta=quantile(delta, 0.95),
.groups="drop"
)
# Plot by cluster and metric
p1 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mucum), lwd=1) +
geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
facet_grid(c("tcpm7"="% cumulative cases", "tdpm7"="% cumulative deaths")[name]~cluster) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="% of cumulative", title="Mean and 90% range by cluster and metric")
print(p1)
# Plot incremental rather than cumulative
p2 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mudelta), lwd=1) +
geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
facet_grid(c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name]~cluster) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="% of total", title="Mean and 90% range by cluster and metric")
print(p2)
# Return the clustering frame
clData
}
plotList <- lapply(2:5, FUN=function(x) plotHierarchical(p2Complete, k=x, df=p2Data))
## .
## 1 2
## 841 149
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 773 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 498 275 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 498 275 68 114 35
## Joining, by = "countyFIPS"
At a glance, with 5 clusters, there is a clear early cluster, a clear early/late cluster, and a group of three seemingly similar clusters (primarily late). Further exploration of these three clusters may be merited.
The five clusters are examined on a single plot:
tempPlotter <- function(df, vecLate, facetScales="free_y", showRibbon=TRUE, showCum=FALSE) {
p1 <- df %>%
mutate(lateCluster=(cluster %in% vecLate)) %>%
ggplot(aes(x=date)) +
geom_line(aes(group=cluster, color=cluster, y=if(showCum) mucum else mudelta), lwd=1) +
facet_grid(lateCluster~c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name],
scales=facetScales
) +
scale_color_brewer(palette="Set1") +
labs(x=NULL,
y=NULL,
title="Disease burden by time period and shape cluster"
)
if (isTRUE(showRibbon)) {
p1 <- p1 +
geom_ribbon(aes(ymin=if(showCum) p05cum else p05delta,
ymax=if(showCum) p95cum else p95delta,
fill=cluster,
group=cluster
), alpha=0.25) +
scale_fill_brewer(palette="Set1") +
labs(subtitle="Shaded region covers 90% of observations")
}
p1
}
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE)
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE, showCum=TRUE)
tempPlotter(plotList[[4]], vecLate=c(4, 5))
tempPlotter(plotList[[4]], vecLate=c(4, 5), showCum=TRUE)
Visually, there are meaningful distinctions in the clusters when overlaid, particularly with the shape of the cumulative curve:
The geographic locations of the five hierarchical cluster types are plotted:
tempMapper <- function(obj,
k,
regions="counties",
varName=if(regions=="counties") "fips" else "state",
values="cluster",
lvlOrder=NULL,
title=if(is.null(lvlOrder)) NULL else "Lightest colors have earliest burden",
caption=NULL
) {
df <- cutree(obj, k=k) %>%
clustersToFrame(colNameName=varName)
if (!is.null(lvlOrder)) df[[values]] <- factor(df[[values]], levels=lvlOrder)
df %>%
sparseCountyClusterMap(brewPalette="viridis", caption=caption) +
labs(title=title)
}
tmpText <- "Plots only counties with at least 50k population"
tempMapper(p2Complete, k=2, lvlOrder=c(1, 2), caption=tmpText)
tempMapper(p2Complete, k=3, lvlOrder=c(2, 1, 3), caption=tmpText)
tempMapper(p2Complete, k=4, lvlOrder=c(3, 2, 1, 4), caption=tmpText)
tempMapper(p2Complete, k=5, lvlOrder=c(3, 2, 1, 4, 5), caption=tmpText)
Maps are created to show when counties first cleared specific hurdles for cumulative deaths per million:
tempBurdenMapper <- function(df,
popMin=1,
popMax=Inf,
tgtVar="tdpm",
tgtBPM=1000,
fn=lubridate::quarter,
title=paste0("Time when county first hit ", tgtBPM, " on metric ", tgtVar),
caption=paste("Plots counties with population between", popMin, "and", popMax),
...
) {
p1 <- df %>%
filter(pop < popMax, pop >= popMin) %>%
group_by(countyFIPS) %>%
summarize(bpmMax=max(get(tgtVar), na.rm=TRUE),
keyDate=as.Date(specNA(min)(ifelse(get(tgtVar) >= tgtBPM, date, NA)), origin="1970-01-01"),
.groups="drop"
) %>%
mutate(cluster=fn(keyDate, ...),
cluster=ifelse(is.na(cluster), "not yet", cluster),
fips=countyFIPS
) %>%
sparseCountyClusterMap(brewPalette = "viridis", caption=caption)
p1 + labs(title=title)
}
# Looking at tdpm of 500, 1000, 5000
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=1000, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=5000, with_year=TRUE)
# Looking specifically at 2500 tdpm
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMin=25000, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMax=25000, tgtBPM=2500, with_year=TRUE)
Next steps are to create curves using larger counties, then assess how well they fit the shape of the curves in the smaller counties.
A function is created to calculate the distances from each of the cluster means:
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum") {
dfClust <- dfClust %>%
filter(name==metType) %>%
colSelector(vecSelect=c("cluster", "date", metClust)) %>%
colRenamer(vecRename=c("pct") %>% purrr::set_names(metClust))
dfAll <- dfAll %>%
filter(name==metType) %>%
colSelector(vecSelect=c("countyFIPS", "date", "pct"))
dfClust %>%
left_join(dfAll, by="date") %>%
mutate(delta2=(ifelse(is.na(pct.x), 0, pct.x)-ifelse(is.na(pct.y), 0, pct.y))**2) %>%
group_by(cluster, countyFIPS) %>%
summarize(dist=sum(delta2), .groups="drop") %>%
arrange(countyFIPS, dist)
}
# Create data for all counties
p2All <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>0) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Calculate distances to cluster mean, then print
countyClusterDist <- tempDistCluster(dfClust=plotList[[4]], dfAll=p2All)
countyClusterDist
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <ord> <chr> <dbl>
## 1 1 01001 1.20
## 2 2 01001 3.10
## 3 3 01001 5.48
## 4 4 01001 14.8
## 5 5 01001 48.8
## 6 2 01003 0.938
## 7 1 01003 1.88
## 8 3 01003 7.53
## 9 4 01003 19.3
## 10 5 01003 58.0
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clusterd
tmpDistDF <- cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster") %>%
left_join(countyClusterDist, by="countyFIPS") %>%
arrange(countyFIPS, dist)
tmpDistDF %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=as.integer(as.character(cluster))) %>%
count(hierCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
Most counties are closest to the mean of the hierarchical cluster they are assigned to. There are some differences, as expected since method=“complete” was chosen for the hierarchical clusters. Assignments are then made for all counties, and then plotted:
# Full clustering database
distOnlyClust <- countyClusterDist %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster"),
by="countyFIPS"
)
# Cluster assignments depending on size of county
distOnlyClust %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
sparseCountyClusterMap(rename(distOnlyClust, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(3, 2, 1, 4, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher percentage of burden early")
There are clear geographic patterns to the data, with counties having earlier disease generally being surrounded by other counties with (relatively) earlier disease.
The latest burden is also included:
p2Merge <- p2All %>%
filter(date==max(date), name=="tdpm7") %>%
select(countyFIPS, pop, state, date, tdpm7=value) %>%
inner_join(distOnlyClust, by=c("countyFIPS"))
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
ggplot(aes(x=tdpm7)) +
geom_density(aes(color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million", y="Relative density", title="Deaths vs shape segments")
Smaller counties are more likely to be very low or very high on deaths per million, as expected given the smaller denominators. Among the larger counties, segment 5 appears to generally have higher deaths, a trend that does not appear to hold in smaller counties.
ECDF curves are created as well:
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=countySize, color=countySize), size=1) +
facet_wrap(~cluster) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes all counties (even with 0 deaths)"
)
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
filter(tdpm7>0) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=cluster, color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes only counties with 1+ deaths"
)
Larger counties appear to generally have had lower burdens, with the exception of very low burden, where there is some crossing of the curves at around 500 deaths per million.
Suppose that counties are clustered based on cumulative deaths (not percentage) by month, with focus on larger counties first:
# Include all counties with population greater than 0
p2DataAll <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>=1) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Create distance matrix and hierarchical clusters
p2DistValue100k <- p2DataAll %>%
filter(name=="tdpm7", pop>=100000) %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2CompleteValue100k <- hclust(p2DistValue100k, method="complete")
plot(p2CompleteValue100k)
There appears to be a branch of counties that are unique:
cutree(p2CompleteValue100k, k=2) %>%
vecToTibble(colNameName="countyFIPS", colNameData="cluster") %>%
group_by(cluster) %>%
mutate(n=n()) %>%
ungroup() %>%
filter(n==min(n)) %>%
left_join(p2DataAll, by="countyFIPS") %>%
left_join(select(getCountyData(), countyFIPS, countyName, state)) %>%
filter(name=="tdpm7") %>%
mutate(countyName=paste0(countyFIPS, " - ",
stringr::str_replace(countyName, " County", ""),
" (",
state,
")"
)
) %>%
ggplot(aes(x=date, y=value)) +
geom_line() +
facet_wrap(~countyName) +
labs(x=NULL, y="Deaths per million", title="Counties in the first, small segment")
## Joining, by = c("countyFIPS", "state")
These counties have high total burdens combined with shapes differing from the norm. This leads them to become their own clusters, requiring a greater number of clusters to get at the main groupings:
# Updated to allow for variable other than pct
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
# FUNCtION ARGuMENTS:
# obj: clustering object from hierarchical clustering
# k: number of clusters to create
# df: data frame with burden data
# keyVar: the key variable to use in df
# Counts by cluster
ctCluster <- cutree(obj, k=k) %>%
table() %>%
print()
# Make clustering data frame
clData <- cutree(obj, k=k) %>%
clustersToFrame(colNameName="countyFIPS") %>%
joinFrames(df) %>%
group_by(countyFIPS, name) %>%
mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
ungroup() %>%
group_by(cluster, date, name) %>%
summarize(p05cum=quantile(get(keyVar), 0.05),
mucum=mean(get(keyVar)),
p95cum=quantile(get(keyVar), 0.95),
p05delta=quantile(delta, 0.05),
mudelta=mean(delta),
p95delta=quantile(delta, 0.95),
.groups="drop"
)
# Plot by cluster and metric
p1 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mucum), lwd=1) +
geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
scales=facetScale
) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
print(p1)
# Plot incremental rather than cumulative
p2 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mudelta), lwd=1) +
geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
print(p2)
# Return the clustering frame
clData
}
plotListValue100k <- lapply(2:9,
FUN=function(x) plotHierarchical(p2CompleteValue100k,
k=x,
df=filter(p2DataAll, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 586 13
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 319 267 13
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 319 267 4 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 319 236 4 31 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6
## 319 236 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7
## 319 81 155 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8
## 319 81 155 4 23 8 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8 9
## 319 81 144 4 23 11 8 7 2
## Joining, by = "countyFIPS"
With 9 segments, the following shapes and burdens (using deaths as the metric) are noted:
Separating counties with very high burden and very low burden appears reasonable. Clustering can be re-run with only the counties showing moderate disease burden, where shape is a more meaningful factor.
Segments are selected using k=8, with the four smallest buckets consolidated:
dfClust100k_k8 <- cutree(p2CompleteValue100k, k=8) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
group_by(tempCluster) %>%
mutate(cluster=factor(ifelse(n()>=20, tempCluster, 9))) %>%
ungroup()
dfClust100k_k8
## # A tibble: 599 x 3
## countyFIPS tempCluster cluster
## <chr> <int> <fct>
## 1 01003 1 1
## 2 01015 2 2
## 3 01055 2 2
## 4 01069 2 2
## 5 01073 2 2
## 6 01081 1 1
## 7 01089 1 1
## 8 01097 3 3
## 9 01101 2 2
## 10 01103 2 2
## # ... with 589 more rows
count(dfClust100k_k8, cluster)
## # A tibble: 5 x 2
## cluster n
## <fct> <int>
## 1 1 319
## 2 2 81
## 3 3 155
## 4 9 21
## 5 5 23
The plotHierarchical function is updated to allow for passing clusters to it directly:
# Updated to allow for variable other than pct and obj that is of class data.frame
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
# FUNCtION ARGuMENTS:
# obj: clustering object from hierarchical clustering OR df with countyFIPS-cluster
# k: number of clusters to create
# df: data frame with burden data
# keyVar: the key variable to use in df
# Convert obj to a data frame if obj has been passed as an hclust object
if (!("data.frame") %in% class(obj)) {
if (!("hclust") %in% class(obj))
stop("\nMust pass object that is member of class data.frame or hclust\n")
# Print counts by cluster
cutree(obj, k=k) %>%
table() %>%
print()
# Convert obj to relevant data frame
obj <- cutree(obj, k=k) %>%
clustersToFrame(colNameName="countyFIPS")
}
# Make clustering data frame (obj has been converted to data.frame above if not already passed that way)
clData <- obj %>%
joinFrames(df) %>%
group_by(countyFIPS, name) %>%
mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
ungroup() %>%
group_by(cluster, date, name) %>%
summarize(p05cum=quantile(get(keyVar), 0.05),
mucum=mean(get(keyVar)),
p95cum=quantile(get(keyVar), 0.95),
p05delta=quantile(delta, 0.05),
mudelta=mean(delta),
p95delta=quantile(delta, 0.95),
.groups="drop"
)
# Plot by cluster and metric
p1 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mucum), lwd=1) +
geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
scales=facetScale
) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
print(p1)
# Plot incremental rather than cumulative
p2 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mudelta), lwd=1) +
geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
print(p2)
# Return the clustering frame
clData
}
clData_100k_k8 <- plotHierarchical(obj=select(dfClust100k_k8, countyFIPS, cluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
Next, the existing algorithm is updated to assign every cluster to the closest cluster mean:
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum", typeUse="pct") {
dfClust <- dfClust %>%
filter(name==metType) %>%
colSelector(vecSelect=c("cluster", "date", metClust)) %>%
colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(metClust))
dfAll <- dfAll %>%
filter(name==metType) %>%
colSelector(vecSelect=c("countyFIPS", "date", typeUse)) %>%
colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(typeUse))
dfClust %>%
left_join(dfAll, by="date") %>%
mutate(dx=ifelse(is.na(keyMetric.x), 0, keyMetric.x),
dy=ifelse(is.na(keyMetric.y), 0, keyMetric.y),
delta2=(dx-dy)**2
) %>%
group_by(cluster, countyFIPS) %>%
summarize(dist=sum(delta2), .groups="drop") %>%
arrange(countyFIPS, dist)
}
# Calculate distances to cluster mean, then print
countyClusterDist_new <- tempDistCluster(dfClust=clData_100k_k8, dfAll=p2DataAll, typeUse="value")
countyClusterDist_new
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 3 01001 9569283.
## 2 2 01001 67090707.
## 3 1 01001 88240896.
## 4 5 01001 353564016.
## 5 9 01001 983953954.
## 6 1 01003 14296132.
## 7 3 01003 59333163.
## 8 2 01003 191161474.
## 9 5 01003 563092256.
## 10 9 01003 1367670103.
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_new <- dfClust100k_k8 %>%
select(countyFIPS, hierCluster=cluster) %>%
left_join(countyClusterDist_new, by="countyFIPS") %>%
arrange(countyFIPS, dist)
tmpDistDF_new %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
count(hierCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
While there are some minor differences, overwhelmingly counties are already aligned to the closest cluster mean. The approach is then extended to counties with populations under 100k:
# Full clustering database
distOnlyClust_new <- countyClusterDist_new %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_k8, countyFIPS, hierCluster=cluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_new %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
sparseCountyClusterMap(rename(distOnlyClust_new, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 3, 2, 9, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of tdpm7)"
) +
labs(title="Lighter clusters generally have higher/earlier burden")
There continue to be meaningful geographic trends, with counties frequently adjacent to other counties in the same segment. Smaller counties are over-represented in groups with relatively high burden (9 and 2).
Next steps are to try to bring the shape dimension slightly more to the surface (in particular for segments 2 and 3 which have a fat 90% range early in the pandemic).
Data for segments 2 and 3 are extracted, then clustering is run solely for these groups:
# Create data containing only clusters 2 and 3
p2Data_Seg_23 <- distOnlyClust_new %>%
filter(hierCluster %in% c(2, 3)) %>%
select(countyFIPS, hierCluster) %>%
left_join(p2DataAll, by="countyFIPS")
p2Data_Seg_23
## # A tibble: 234,112 x 8
## countyFIPS hierCluster pop state date name value pct
## <chr> <fct> <dbl> <chr> <date> <chr> <dbl> <dbl>
## 1 01015 2 113605 AL 2020-01-25 tdpm7 0 0
## 2 01015 2 113605 AL 2020-01-25 tcpm7 0 0
## 3 01015 2 113605 AL 2020-01-26 tdpm7 0 0
## 4 01015 2 113605 AL 2020-01-26 tcpm7 0 0
## 5 01015 2 113605 AL 2020-01-27 tdpm7 0 0
## 6 01015 2 113605 AL 2020-01-27 tcpm7 0 0
## 7 01015 2 113605 AL 2020-01-28 tdpm7 0 0
## 8 01015 2 113605 AL 2020-01-28 tcpm7 0 0
## 9 01015 2 113605 AL 2020-01-29 tdpm7 0 0
## 10 01015 2 113605 AL 2020-01-29 tcpm7 0 0
## # ... with 234,102 more rows
# Create distance matrix and hierarchical clusters
p2Dist_Seg_23 <- p2Data_Seg_23 %>%
filter(name=="tdpm7") %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2Complete_Seg_23 <- hclust(p2Dist_Seg_23, method="complete")
plot(p2Complete_Seg_23)
# Plot cluster summaries
plotList_Seg_23 <- lapply(2:5,
FUN=function(x) plotHierarchical(p2Complete_Seg_23,
k=x,
df=filter(p2Data_Seg_23, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 81 155
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 81 144 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 48 144 33 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 46 2 144 33 11
## Joining, by = "countyFIPS"
# Create the four cluster summary
dfClust_Seg_23_k4 <- cutree(p2Complete_Seg_23, k=4) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
mutate(cluster=factor(tempCluster))
# Comparison to original hierarchical cluster
dfClust_Seg_23_k4 %>%
left_join(select(distOnlyClust_new, countyFIPS, hierCluster), by="countyFIPS") %>%
count(hierCluster, cluster)
## # A tibble: 4 x 3
## hierCluster cluster n
## <fct> <fct> <int>
## 1 2 1 48
## 2 2 3 33
## 3 3 2 144
## 4 3 4 11
# Map of the new clusters
dfClust_Seg_23_k4 %>%
select(fips=countyFIPS, cluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
# Plot the four cluster summary
clData_Seg_23_k4 <- plotHierarchical(obj=select(dfClust_Seg_23_k4, countyFIPS, cluster),
df=p2Data_Seg_23,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
As expected, each of the original segments is split, helping to bring out a mix of shape and total burden:
While small, there is meaningful value in the ~2000 deaths per million cluster with early disease. It is focused on major metros (Chicago, Detroit, Philadelphia, New Orleans, NYC collar) that had early spikes similar to core NYC counties but without the associated high levels of total burden. These are distinct from the much larger group of counties that got to ~2000 deaths per million with a much later timing.
Next steps are to create a full clustering database and extend to counties with under 100k population.
First, the segments for counties with 100k+ are combined:
dfClust100k_full <- dfClust100k_k8 %>%
select(countyFIPS, origCluster=cluster) %>%
full_join(select(dfClust_Seg_23_k4, countyFIPS, newCluster=cluster), by="countyFIPS") %>%
mutate(finalCluster=ifelse(is.na(newCluster),
as.integer(as.character(origCluster)),
10*as.integer(as.character(origCluster)) + as.integer(as.character(newCluster))
),
finalCluster=factor(finalCluster, levels=sort(unique(finalCluster)))
)
dfClust100k_full
## # A tibble: 599 x 4
## countyFIPS origCluster newCluster finalCluster
## <chr> <fct> <fct> <fct>
## 1 01003 1 <NA> 1
## 2 01015 2 1 21
## 3 01055 2 1 21
## 4 01069 2 1 21
## 5 01073 2 1 21
## 6 01081 1 <NA> 1
## 7 01089 1 <NA> 1
## 8 01097 3 2 32
## 9 01101 2 3 23
## 10 01103 2 1 21
## # ... with 589 more rows
dfClust100k_full %>%
count(origCluster, newCluster, finalCluster)
## # A tibble: 7 x 4
## origCluster newCluster finalCluster n
## <fct> <fct> <fct> <int>
## 1 1 <NA> 1 319
## 2 2 1 21 48
## 3 2 3 23 33
## 4 3 2 32 144
## 5 3 4 34 11
## 6 9 <NA> 9 21
## 7 5 <NA> 5 23
# Plot the seven cluster summary
clData_Seg_100k_full <- plotHierarchical(obj=select(dfClust100k_full, countyFIPS, cluster=finalCluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
# Map of the new clusters
dfClust100k_full %>%
select(fips=countyFIPS, cluster=finalCluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
Smaller counties are then mapped to the nearest segment average:
# Calculate distances to cluster mean, then print
countyClusterDist_100k_full <- tempDistCluster(dfClust=clData_Seg_100k_full, dfAll=p2All, typeUse="value")
countyClusterDist_100k_full
## # A tibble: 21,994 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 32 01001 6770143.
## 2 21 01001 46397990.
## 3 1 01001 88240896.
## 4 34 01001 123376092.
## 5 23 01001 125257271.
## 6 5 01001 353564016.
## 7 9 01001 983953954.
## 8 1 01003 14296132.
## 9 32 01003 51092603.
## 10 21 01003 143224481.
## # ... with 21,984 more rows
# Assess overlap of cluster for counties already clusterd
tmpDistDF_100k_full <- dfClust100k_full %>%
select(countyFIPS, finalCluster) %>%
left_join(countyClusterDist_100k_full, by="countyFIPS") %>%
arrange(countyFIPS, dist)
tmpDistDF_100k_full %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
count(hierCluster=finalCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
# Full clustering database
distOnlyClust_100k_full <- countyClusterDist_100k_full %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_full, countyFIPS, hierCluster=finalCluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_100k_full %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
sparseCountyClusterMap(rename(distOnlyClust_100k_full, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 32, 34, 21, 23, 5, 9))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher burden and/or higher percentage of burden early")
There continues to be meaningful patterns by geography. Next steps are to download the latest data and run the full segment diagnostics.
New data are downloaded, with the above segments applied:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210622.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210622.csv"
)
compareList <- list("usafCase"=cty_newdata_20210608$dfRaw$usafCase,
"usafDeath"=cty_newdata_20210608$dfRaw$usafDeath
)
dfClust <- distOnlyClust_100k_full %>%
select(countyFIPS, cluster) %>%
mutate(cluster=as.character(cluster)) %>%
left_join(getCountyData(), by="countyFIPS") %>%
mutate(origCluster=cluster, cluster=ifelse(pop>=25000, cluster, "999"))
useClusters <- as.character(dfClust$cluster) %>%
purrr::set_names(dfClust$countyFIPS)
# Create new clusters
cty_newdata_20210622 <- readRunUSAFacts(maxDate="2021-06-20",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log",
ovrwriteLog=TRUE,
useClusters=useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210622.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210622.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.66e+9 3.29e+7 1647588
## 2 after 6.63e+9 3.28e+7 1621272
## 3 pctchg 4.52e-3 3.78e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.34e+8 595239 1647588
## 2 after 1.34e+8 593224 1621272
## 3 pctchg 3.70e-3 0.00339 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210622$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
saveToRDS(cty_newdata_20210622, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20210622.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
The updated county clustering routines appear to work as intended. Next steps are to update the CDC excess deaths process.
The latest data are downloaded, with existing clusters applied:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210622")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210622")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210813 <- readRunUSAFacts(maxDate="2021-08-11",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210622")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 8.38e+9 3.54e+7 1810431
## 2 after 8.34e+9 3.52e+7 1781514
## 3 pctchg 4.49e-3 5.53e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.65e+8 611955 1810431
## 2 after 1.64e+8 607482 1781514
## 3 pctchg 3.93e-3 0.00731 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210813$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Dates with large amounts of negative restatement
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
arrange(-nNeg)
## # A tibble: 567 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 557 more rows
# Restatements taking place between July 4-14, 2021
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2021-07-04", date <= "2021-07-14")
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 89 1588 21977
## 11 2021-07-14 97 1984 32297
# Restatements taking place between December 21, 2020 and January 6, 2021
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
saveToRDS(cty_newdata_20210813, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20210813.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
There is a very large restatement of new_cases data occurring on or around July 8-11, 2021. Perhaps smoothing can be considered where each value for July 8-11, 2021 data is treated as the average across those four days. There is a similar though smaller anomaly in the December 24-29, 2020 range.
Dates are examined for those with the largest number of negative revisions in new_cases and/or new_deaths:
# All restatements
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
group_by(date, name) %>%
summarize(nNeg=sum(value <= -1), sumNeg=sum(ifelse(value < 0, value, 0)), .groups="drop")
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting less than or equal to -1 for metric on day"
)
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value for counties recording negative",
title="Counties reporting less than 0 for metric on day"
)
For records with negative values, the rolling-7 is calculated around the day of the negative:
# All restatements
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
arrange(date) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
group_by(date, name) %>%
summarize(nNeg=sum(isNeg & isNeg7), sumNeg=sum(ifelse(isNeg & isNeg7, value, 0)), .groups="drop")
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting negative value that are also negative over the 7-day window"
)
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value",
title="Counties reporting less than 0 for metric on day that are also negative over the 7-day window"
)
cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
arrange(date) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
ungroup() %>%
group_by(name, isNeg, isNeg7) %>%
summarize(n=n(), sumValue=sum(value))
## `summarise()` has grouped output by 'name', 'isNeg'. You can override using the `.groups` argument.
## # A tibble: 16 x 5
## # Groups: name, isNeg [8]
## name isNeg isNeg7 n sumValue
## <chr> <lgl> <lgl> <int> <dbl>
## 1 cpm7 FALSE FALSE 1701869 336714571.
## 2 cpm7 FALSE TRUE 45946 3248568.
## 3 cpm7 TRUE FALSE 3859 -265077.
## 4 cpm7 TRUE TRUE 10988 -2790977.
## 5 dpm7 FALSE FALSE 1627092 6667360.
## 6 dpm7 FALSE TRUE 126669 20031.
## 7 dpm7 TRUE FALSE 678 -4144.
## 8 dpm7 TRUE TRUE 8223 -102243.
## 9 new_cases FALSE FALSE 1732888 35567937
## 10 new_cases FALSE TRUE 30335 362101
## 11 new_cases TRUE FALSE 14909 -600467
## 12 new_cases TRUE TRUE 3382 -114828
## 13 new_deaths FALSE FALSE 1749115 616693
## 14 new_deaths FALSE TRUE 27664 2807
## 15 new_deaths TRUE FALSE 3087 -7199
## 16 new_deaths TRUE TRUE 1648 -4819
Making an adjustment for situations where a metrics is negative but the 7-day total is non-negative can help to smooth some restatement anomalies. There sill still be restatements in the data, and these seem to increasein the more recent months of the data.
Next steps are to build a function to manage the smoothing, with a goal to incorporate this as part of per-capita processing.
A function is built to take an ordered vector and to replace negative values, and all of the surrounding n values, with the mean of those n values:
smoothNegativeValues <- function(x,
negTol=-0.001,
windowSize=3,
windowAlign="center",
...
) {
# FUNCTION ARGUMENTS:
# x: a vector that has been sorted such that successive observations are sequential
# negTol: tolerance for considering a value in x to be negative
# windowSize: the size for the window to use
# windowAlign: the alignment for the window (passed to zoo::rollmean)
# ...: any other arguments to pass to zoo::rollmean
# Get the rolling means for x
muRoll <- zoo::rollmean(x, k=windowSize, fill=NA, align=windowAlign, ...)
# Find the negative values and those that can be replaced
negVals <- which(x < negTol)
okFix <- which(!is.na(muRoll) & muRoll >= negTol)
negFix <- intersect(negVals, okFix)
# Replace the negative values with the rolling mean
# CAUTION: this will double over-write if negative values are very close to each other
# CAUTION: only works for centering for the time being
eachSide <- ceiling((windowSize-1)/2)
for (idxFix in negFix) x[(idxFix-eachSide):(idxFix+eachSide)] <- muRoll[idxFix]
# Return the updated vector
x
}
# Simple example, works well
smoothNegativeValues(c(1:5, 10, -2, 10, 5:1))
## [1] 1 2 3 4 5 6 6 6 5 4 3 2 1
# More complex example, double overwriting leads to incorrect vector sum
smoothNegativeValues(c(1:5, 14, -2, -2, 14, 5:1)) %>% round(1)
## [1] 1.0 2.0 3.0 4.0 5.0 3.3 3.3 3.3 3.3 5.0 4.0 3.0 2.0 1.0
# All counties with negatives
allNegCases <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, new_cases) %>%
group_by(countyFIPS) %>%
filter(new_cases <= -0.5) %>%
ungroup() %>%
arrange(new_cases)
allNegCases
## # A tibble: 18,291 x 4
## countyFIPS state date new_cases
## <chr> <chr> <date> <dbl>
## 1 48113 TX 2021-07-09 -43526
## 2 48439 TX 2021-07-09 -43415
## 3 48029 TX 2021-07-09 -41518
## 4 48215 TX 2021-07-09 -30342
## 5 55053 WI 2020-12-27 -21000
## 6 48121 TX 2021-07-09 -20662
## 7 48029 TX 2020-12-25 -18856
## 8 48085 TX 2021-07-09 -17026
## 9 06037 CA 2020-12-25 -12459
## 10 48355 TX 2021-07-09 -12254
## # ... with 18,281 more rows
allNegCases %>%
summarize(n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
## n new_cases
## <int> <dbl>
## 1 18291 -715295
# Function to check data in a key county
countyCheck <- function(df, keyDate, keyFIPS, windowSize=7) {
if (!("Date" %in% class(keyDate))) keyDate <- as.Date(keyDate)
ctyData <- df %>%
filter(countyFIPS==keyFIPS)
ctyData %>%
filter(round(new_cases) < 0) %>%
print()
ctyData <- ctyData %>%
mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=windowSize))
ctyData %>%
filter(date >= keyDate - ceiling((windowSize-1)/2), date <= keyDate + ceiling((windowSize-1)/2)) %>%
select(countyFIPS, state, date, cases, new_cases, smoothCases) %>%
print()
p1 <- ctyData %>%
select(countyFIPS, state, date, new_cases, smoothCases) %>%
pivot_longer(-c(countyFIPS, state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name)) +
facet_wrap(~c("new_cases"="Raw cases", "smoothCases"="Smoothed cases")[name]) +
labs(x=NULL, y="Cases", title=paste0("Cases data for county ", keyFIPS)) +
theme(legend.position="none")
print(p1)
ctyData %>%
select(countyFIPS, state, date, new_cases, smoothCases) %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE)) %>%
print()
}
# Data for countyFIPS 48113 (Texas, negative values on July 9)
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48113")
## # A tibble: 1 x 15
## countyFIPS state date cases new_cases deaths new_deaths tcpm cpm
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48113 TX 2021-07-09 263826 -43526 4139 4 100104. -16515.
## # ... with 6 more variables: tdpm <dbl>, dpm <dbl>, tcpm7 <dbl>, cpm7 <dbl>,
## # tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
## countyFIPS state date cases new_cases smoothCases
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 48113 TX 2021-07-06 306870 0 147.
## 2 48113 TX 2021-07-07 307033 163 147.
## 3 48113 TX 2021-07-08 307352 319 147.
## 4 48113 TX 2021-07-09 263826 -43526 147.
## 5 48113 TX 2021-07-10 307901 44075 147.
## 6 48113 TX 2021-07-11 307901 0 147.
## 7 48113 TX 2021-07-12 307901 0 147.
## # A tibble: 1 x 2
## new_cases smoothCases
## <dbl> <dbl>
## 1 323669 323669
# Examples with repeating negative values
repNegCases <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, new_cases) %>%
group_by(countyFIPS) %>%
filter(new_cases <= -0.5, (lag(new_cases) <= -0.5 | lead(new_cases) <= -0.5)) %>%
ungroup() %>%
arrange(new_cases)
repNegCases
## # A tibble: 2,278 x 4
## countyFIPS state date new_cases
## <chr> <chr> <date> <dbl>
## 1 48471 TX 2021-07-09 -870
## 2 48325 TX 2021-06-12 -406
## 3 21235 KY 2021-06-15 -390
## 4 48025 TX 2021-07-09 -390
## 5 48185 TX 2021-07-09 -390
## 6 21235 KY 2021-06-16 -380
## 7 48201 TX 2021-06-17 -345
## 8 06025 CA 2021-06-30 -336
## 9 48069 TX 2021-07-09 -314
## 10 21235 KY 2021-06-12 -309
## # ... with 2,268 more rows
repNegCases %>%
summarize(n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
## n new_cases
## <int> <dbl>
## 1 2278 -16455
# Data for countyFIPS 48471 (Texas, repeating negative values on/around July 9)
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48471")
## # A tibble: 52 x 15
## countyFIPS state date cases new_cases deaths new_deaths tcpm cpm
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48471 TX 2020-08-05 2838 -4 40 2 38892. -54.8
## 2 48471 TX 2020-08-16 3189 -1 43 0 43702. -13.7
## 3 48471 TX 2020-09-03 3417 -246 48 0 46827. -3371.
## 4 48471 TX 2020-09-04 3415 -2 48 0 46799. -27.4
## 5 48471 TX 2020-09-09 3403 -451 51 1 46635. -6181.
## 6 48471 TX 2020-09-12 3370 -46 52 0 46183. -630.
## 7 48471 TX 2020-09-17 3567 -119 53 0 48882. -1631.
## 8 48471 TX 2020-09-23 3690 -3 56 0 50568. -41.1
## 9 48471 TX 2020-09-24 3689 -1 57 1 50554. -13.7
## 10 48471 TX 2020-10-07 3788 -31 57 0 51911. -425.
## # ... with 42 more rows, and 6 more variables: tdpm <dbl>, dpm <dbl>,
## # tcpm7 <dbl>, cpm7 <dbl>, tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
## countyFIPS state date cases new_cases smoothCases
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 48471 TX 2021-07-06 8703 2 3.29
## 2 48471 TX 2021-07-07 8757 54 3.29
## 3 48471 TX 2021-07-08 8718 -39 3.29
## 4 48471 TX 2021-07-09 7848 -870 3.29
## 5 48471 TX 2021-07-10 8718 870 3.29
## 6 48471 TX 2021-07-11 8721 3 1.29
## 7 48471 TX 2021-07-12 8724 3 1.29
## # A tibble: 1 x 2
## new_cases smoothCases
## <dbl> <dbl>
## 1 9240 9393.
Curing the single negative value issue addresses over 95% of the observed issue with new_cases.
Despite the limitations, this appears promising for further exploration. The function is applied to the full per-capita dataset:
# Smoothed cases for all counties
perCapTemp <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, cases, new_cases) %>%
group_by(countyFIPS, state) %>%
mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=7),
cumNew=cumsum(new_cases),
cumSmooth=cumsum(smoothCases),
smooth7=zoo::rollmean(smoothCases, k=7, fill=NA),
cases7=zoo::rollmean(new_cases, k=7, fill=NA)
) %>%
ungroup()
perCapTemp
## # A tibble: 1,781,514 x 10
## countyFIPS state date cases new_cases smoothCases cumNew cumSmooth
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2020-01-22 0 0 0 0 0
## 2 01003 AL 2020-01-22 0 0 0 0 0
## 3 01005 AL 2020-01-22 0 0 0 0 0
## 4 01007 AL 2020-01-22 0 0 0 0 0
## 5 01009 AL 2020-01-22 0 0 0 0 0
## 6 01011 AL 2020-01-22 0 0 0 0 0
## 7 01013 AL 2020-01-22 0 0 0 0 0
## 8 01015 AL 2020-01-22 0 0 0 0 0
## 9 01017 AL 2020-01-22 0 0 0 0 0
## 10 01019 AL 2020-01-22 0 0 0 0 0
## # ... with 1,781,504 more rows, and 2 more variables: smooth7 <dbl>,
## # cases7 <dbl>
perCapTemp %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE))
## # A tibble: 1 x 7
## cases new_cases smoothCases cumNew cumSmooth smooth7 cases7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8343470745 35214743 35220770 8343470745 8344928995. 34880628. 34874611.
plotTemp <- perCapTemp %>%
group_by(date) %>%
summarize(across(where(is.numeric), specNA(sum))) %>%
pivot_longer(-date)
# Plot #1: Cumulative cases by technique
plotTemp %>%
filter(name %in% c("cases", "cumNew", "cumSmooth")) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=name,
color=c("cases"="Cases", "cumNew"="Sum new_cases", "cumSmooth"="Sum smoothed cases")[name]
)
) +
labs(x=NULL, y="Cumulative Cases (millions)", title="Cumulative case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
# Plot #2: new_cases vs smoothed cases
plotTemp %>%
filter(name %in% c("new_cases", "smoothCases")) %>%
ggplot(aes(x=date, y=value/1000)) +
geom_line(aes(group=name,
color=c("new_cases"="Raw new_cases", "smoothCases"="Smoothed cases")[name]
)
) +
labs(x=NULL, y="New Cases (000s)", title="New case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
# Plot #3: Rolling-7 mean for new_cases vs smoothed cases
plotTemp %>%
filter(name %in% c("cases7", "smooth7")) %>%
ggplot(aes(x=date, y=value/1000)) +
geom_line(aes(group=name,
color=c("cases7"="new_cases", "smooth7"="Smoothed cases")[name]
)
) +
labs(x=NULL, y="New Cases (000s) - rolling 7-day mean", title="New case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
## Warning: Removed 12 row(s) containing missing values (geom_path).
At a national level, the smoothing significantly helps with the July negative/positive restatement, as well as providing some assistance in other time periods. Plots are made for each state using the smoothed data:
perCapTemp %>%
group_by(state, date) %>%
summarize(smooth7=specNA(sum)(smooth7), .groups="drop") %>%
ggplot(aes(x=date, y=smooth7/1000)) +
geom_line() +
labs(x=NULL, y="Cases (000s)", title="Smoothed rolling-7 mean cases per day by state") +
facet_wrap(~state, scales="free_y")
## Warning: Removed 6 row(s) containing missing values (geom_path).
Several artifacts are observed, including:
These can be further explored as a next step, with particular focus on Florida (recency) and the late 2020 pattern observed across multiple states. States are assessed for the occurrence of zero casedays after July 1, 2020:
perCapGroup <- perCapTemp %>%
mutate(epi=paste0(lubridate::epiyear(date), "-", zeroPad2(lubridate::epiweek(date)))) %>%
group_by(state, date, epi) %>%
summarize(new_cases=sum(new_cases, na.rm=TRUE), n=n(), .groups="drop")
perCapGroup
## # A tibble: 28,917 x 5
## state date epi new_cases n
## <chr> <date> <chr> <dbl> <int>
## 1 AK 2020-01-22 2020-04 0 29
## 2 AK 2020-01-23 2020-04 0 29
## 3 AK 2020-01-24 2020-04 0 29
## 4 AK 2020-01-25 2020-04 0 29
## 5 AK 2020-01-26 2020-05 0 29
## 6 AK 2020-01-27 2020-05 0 29
## 7 AK 2020-01-28 2020-05 0 29
## 8 AK 2020-01-29 2020-05 0 29
## 9 AK 2020-01-30 2020-05 0 29
## 10 AK 2020-01-31 2020-05 0 29
## # ... with 28,907 more rows
perCapGroup %>%
filter(date >= "2020-07-01", round(new_cases) <= 0) %>%
group_by(state) %>%
mutate(nDays=n()) %>%
ungroup() %>%
ggplot(aes(x=date, y=fct_reorder(state, nDays))) +
geom_tile(aes(fill=ifelse(round(new_cases)==0, "blue", "red"))) +
scale_fill_identity() +
labs(x=NULL,
y=NULL,
title="States not reporting positive cases by day",
subtitle="Blue is zero, red is negative"
)
perCapGroup %>%
filter(epi >= "2020-26", epi <= "2021-31") %>%
group_by(state, epi) %>%
summarize(nPos=sum(round(new_cases)>0), .groups="drop") %>%
group_by(state) %>%
mutate(nTot=sum(nPos)) %>%
ungroup() %>%
ggplot(aes(x=epi, y=fct_reorder(state, nTot))) +
geom_tile(aes(fill=factor(nPos, levels=0:7))) +
coord_flip() +
scale_fill_viridis_d("# days") +
labs(x=NULL,
y=NULL,
title="Number of days in epiweek with positive cases reported for state"
)
Reporting appears to be less frequent in many states in 2021. Due to holidays and other artifacts, reporting may not have a strict 7-day seasonality, thus smoothing and rolling-7 calculations may still be spiky. This is particularly prevalent around the late winter holiday (mid-December to early-January), which may drive spikes observed in late 2020.
Nebraska appears to have stopped reporting new_cases data at the county level and Rhode Island appears to have always reported on a once-per-week basis.
Particular attention should be paid to states that show a pattern by epiweek (e.g., always 5 or always 7) that then changes, especially if on a temporary basis. That appears to flag many of the states that have spiky cases, even after smoothing, in late 2020.
States and epiweeks that meet the following criteria are extracted:
Example code includes:
perCapReport <- perCapGroup %>%
group_by(state, epi) %>%
summarize(nPos=sum(round(new_cases)>0), .groups="drop") %>%
arrange(state, epi) %>%
group_by(state) %>%
mutate(dPrev=ifelse(row_number()==1, 0, abs(lag(nPos)-nPos)/pmax(1, lag(nPos)+nPos)),
dNext=ifelse(row_number()==n(), 0, abs(lead(nPos)-nPos)/pmax(1, lead(nPos)+nPos))
) %>%
ungroup()
perCapReport
## # A tibble: 4,182 x 5
## state epi nPos dPrev dNext
## <chr> <chr> <int> <dbl> <dbl>
## 1 AK 2020-04 0 0 0
## 2 AK 2020-05 0 0 0
## 3 AK 2020-06 0 0 0
## 4 AK 2020-07 0 0 0
## 5 AK 2020-08 0 0 0
## 6 AK 2020-09 0 0 0
## 7 AK 2020-10 0 0 1
## 8 AK 2020-11 1 1 0.5
## 9 AK 2020-12 3 0.5 0.4
## 10 AK 2020-13 7 0.4 0
## # ... with 4,172 more rows
perCapAnomaly <- perCapReport %>%
mutate(dMax=pmax(dPrev, dNext)) %>%
filter(epi >= "2020-26", epi <= "2021-30", dMax >= 0.3) %>%
group_by(state) %>%
mutate(n=n()) %>%
ungroup()
perCapAnomaly
## # A tibble: 123 x 7
## state epi nPos dPrev dNext dMax n
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 AK 2020-46 7 0 0.4 0.4 5
## 2 AK 2020-47 3 0.4 0.5 0.5 5
## 3 AK 2020-48 1 0.5 0 0.5 5
## 4 AK 2020-49 1 0 0.75 0.75 5
## 5 AK 2020-50 7 0.75 0 0.75 5
## 6 AZ 2020-51 7 0 0.75 0.75 6
## 7 AZ 2020-52 1 0.75 0.714 0.75 6
## 8 AZ 2020-53 6 0.714 0.0769 0.714 6
## 9 AZ 2021-07 7 0 0.556 0.556 6
## 10 AZ 2021-08 2 0.556 0.429 0.556 6
## # ... with 113 more rows
perCapAnomaly%>%
ggplot(aes(x=fct_reorder(state, -n), y=epi)) +
geom_tile(aes(fill=factor(nPos, levels=0:7))) +
scale_fill_viridis_d("# days") +
labs(x=NULL, y=NULL, title="Potential reporting discontinuities by state and epiweek")
# Examples for KS data
perCapGroup %>%
semi_join(perCapAnomaly, by=c("state", "epi")) %>%
ggplot(aes(x=date, y=new_cases)) +
geom_point() +
labs(x=NULL, y="New cases", title="Potentially anomalous data weeks by state") +
facet_wrap(~state, scales="free")
This algorithm broadly picks out the periods where states show artifact-like spikiness on the rolling-7 case charts. Next steps are to run this process on the smoothed data, as some negative/positive spikes are addressed automatically by that methodology.
A comparison is made between the summed USAF county-level data and the CDC state-level data:
stateSmoothUSAF <- perCapTemp %>%
group_by(state, date) %>%
summarize(across(where(is.numeric), .fns=specNA(sum)), .groups="drop")
stateSmoothUSAF
## # A tibble: 28,917 x 9
## state date cases new_cases smoothCases cumNew cumSmooth smooth7 cases7
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-01-22 0 0 0 0 0 NA NA
## 2 AK 2020-01-23 0 0 0 0 0 NA NA
## 3 AK 2020-01-24 0 0 0 0 0 NA NA
## 4 AK 2020-01-25 0 0 0 0 0 0 0
## 5 AK 2020-01-26 0 0 0 0 0 0 0
## 6 AK 2020-01-27 0 0 0 0 0 0 0
## 7 AK 2020-01-28 0 0 0 0 0 0 0
## 8 AK 2020-01-29 0 0 0 0 0 0 0
## 9 AK 2020-01-30 0 0 0 0 0 0 0
## 10 AK 2020-01-31 0 0 0 0 0 0 0
## # ... with 28,907 more rows
stateCDC <- readFromRDS("cdc_daily_210815")$dfPerCapita %>%
select(state, date, cdcCases=new_cases) %>%
mutate(cdcCases=ifelse(is.na(cdcCases), 0, cdcCases)) %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(cdcCase7=zoo::rollmean(cdcCases, k=7, fill=NA)) %>%
ungroup()
stateCDC
## # A tibble: 29,334 x 4
## state date cdcCases cdcCase7
## <chr> <date> <dbl> <dbl>
## 1 AK 2020-01-22 0 NA
## 2 AK 2020-01-23 0 NA
## 3 AK 2020-01-24 0 NA
## 4 AK 2020-01-25 0 0
## 5 AK 2020-01-26 0 0
## 6 AK 2020-01-27 0 0
## 7 AK 2020-01-28 0 0
## 8 AK 2020-01-29 0 0
## 9 AK 2020-01-30 0 0
## 10 AK 2020-01-31 0 0
## # ... with 29,324 more rows
tempPlotData <- stateSmoothUSAF %>%
select(state, date, usafSmooth=smoothCases, usafSmooth7=smooth7) %>%
full_join(stateCDC, by=c("state", "date")) %>%
arrange(date) %>%
group_by(state) %>%
mutate(cumUSAF=cumsum(ifelse(is.na(usafSmooth), 0, usafSmooth)), cumCDC=cumsum(cdcCases)) %>%
ungroup() %>%
pivot_longer(-c(state, date))
tempPlotState <- function(df, keyStates, keyVars) {
p1 <- df %>%
filter(name %in% all_of(keyVars), state %in% all_of(keyStates), !is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name)) +
labs(x=NULL, y="Cases", title="Comparison of cases in CDC and USAF") +
scale_color_discrete("Metric") +
facet_wrap(~state, scales="free_y")
print(p1)
}
# Examples for states that appear well-aligned
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("usafSmooth", "cdcCases"))
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("cumCDC", "cumUSAF"))
# Examples for states with anomalies
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("usafSmooth", "cdcCases"))
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("cumCDC", "cumUSAF"))
# States with largest disconnects as percentage
tempPlotData %>%
filter(name %in% c("cumUSAF", "cumCDC")) %>%
pivot_wider(c(state, date)) %>%
group_by(state) %>%
summarize(absDelta=sum(abs(cumUSAF-cumCDC)), sumValues=sum(cumUSAF+cumCDC)) %>%
mutate(pct=absDelta/sumValues) %>%
ggplot(aes(x=fct_reorder(state, pct), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), hjust=0, size=3) +
labs(x=NULL,
y="Percent different in cumsum for CDC and USAF",
title="Cumulative totals in CDC vs USAF",
subtitle="Calculated as absolute value of differences divided by mean of values"
) +
coord_flip()
# Examples for states with large differences
tempPlotState(tempPlotData, keyStates=c("GA", "MI", "RI", "VT", "MA", "CA"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("GA", "MI", "RI", "VT", "MA", "CA"), keyVars=c("cumCDC", "cumUSAF"))
In some states, there is a general trend that county-level sums (USAF) are lower than state-level totals (CDC). In these states, shape of the curve appears to be closely matched between the data sources.
For the more spiky states, it appears that cumulative totals are roughly aligned, but with differences along the way to the cumulative total.
The latest data are downloaded, with existing clusters applied:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210904.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210904.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210813")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210813")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210904 <- readRunUSAFacts(maxDate="2021-09-02",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210904.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 6
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 88 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210904.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 6
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 37 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 8.60e+9 3.62e+7 1829589
## 2 after 8.56e+9 3.60e+7 1800366
## 3 pctchg 4.51e-3 4.05e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.68e+8 615386 1829589
## 2 after 1.68e+8 610247 1800366
## 3 pctchg 4.03e-3 0.00835 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210904$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Dates with large amounts of negative restatement
cty_newdata_20210904$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
arrange(-nNeg)
## # A tibble: 573 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 563 more rows
# Restatements taking place between July 4-14, 2021
cty_newdata_20210904$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2021-07-04", date <= "2021-07-14")
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 92 1603 23919
## 11 2021-07-14 97 1983 32194
# Restatements taking place between December 21, 2020 and January 6, 2021
cty_newdata_20210904$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
saveToRDS(cty_newdata_20210904, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20210904.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
The update appears to be missing data after August 16, 2021. Further exploration is needed. It appears that the data have moved to a different link. The urlMapper is updated and the process re-run:
urlMapper[["usafCase"]] <- "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
urlMapper[["usafDeath"]] <- "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"
urlMapper[["usafPop"]] <- "https://static.usafacts.org/public/data/covid-19/covid_county_population_usafacts.csv"
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210905.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210905.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210813")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210813")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210905 <- readRunUSAFacts(maxDate="2021-09-03",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210905.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 24
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 17 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 371 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210905.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 24
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 16 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 216 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 9.28e+9 3.90e+7 1887063
## 2 after 9.23e+9 3.89e+7 1856922
## 3 pctchg 4.70e-3 4.54e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.80e+8 641157 1887063
## 2 after 1.79e+8 630136 1856922
## 3 pctchg 4.80e-3 0.0172 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210905$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Dates with large amounts of negative restatement
cty_newdata_20210905$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
arrange(-nNeg)
## # A tibble: 591 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 581 more rows
# Restatements taking place between July 4-14, 2021
cty_newdata_20210905$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2021-07-04", date <= "2021-07-14")
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 92 1603 23919
## 11 2021-07-14 97 1983 32194
# Restatements taking place between December 21, 2020 and January 6, 2021
cty_newdata_20210905$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
saveToRDS(cty_newdata_20210905, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20210905.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
Data are updated as expected. As in previous downloads, there are some large restatements that take place in a few key weeks and states.
Plots are created for total deaths per capita by county:
# Create data for total burden and change in burden over past 28 days
burdenSummary <- cty_newdata_20210905$dfPerCapita %>%
group_by(countyFIPS, state) %>%
summarize(asofDate=max(date),
tdpm=sum(ifelse(date==max(date), tdpm, 0)),
tcpm=sum(ifelse(date==max(date), tcpm, 0)),
dpm28=sum(ifelse(date>max(date)-28, dpm, 0)),
cpm28=sum(ifelse(date>max(date)-28, cpm, 0)),
.groups="drop"
)
burdenSummary
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-09-03 2130. 159462. 89.5 21747.
## 2 01003 AL 2021-09-03 1756. 151361. 278. 31209.
## 3 01005 AL 2021-09-03 2633. 127279. 81.0 22563.
## 4 01007 AL 2021-09-03 3304. 159864. 357. 27105.
## 5 01009 AL 2021-09-03 2525. 151541. 104. 21115.
## 6 01011 AL 2021-09-03 4158. 138501. 0 13068.
## 7 01013 AL 2021-09-03 3908. 147367. 206. 20362.
## 8 01015 AL 2021-09-03 3125. 161172. 176. 23221.
## 9 01017 AL 2021-09-03 3819. 145546. 60.1 23516.
## 10 01019 AL 2021-09-03 1871. 96541. 38.2 18705.
## # ... with 3,132 more rows
# Create histograms for burden metrics
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
ggplot(aes(x=value)) +
geom_histogram() +
facet_wrap(~name, scales="free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Negative values in the data (likely due to restatement)
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
filter(value < 0)
## # A tibble: 16 x 5
## countyFIPS state asofDate name value
## <chr> <chr> <date> <chr> <dbl>
## 1 02013 AK 2021-09-03 dpm28 -899.
## 2 02013 AK 2021-09-03 cpm28 -83908.
## 3 02016 AK 2021-09-03 cpm28 -93184.
## 4 02240 AK 2021-09-03 cpm28 -1596.
## 5 02261 AK 2021-09-03 cpm28 -28472.
## 6 08005 CO 2021-09-03 dpm28 -9.14
## 7 08055 CO 2021-09-03 dpm28 -145.
## 8 13257 GA 2021-09-03 dpm28 -38.6
## 9 17011 IL 2021-09-03 dpm28 -30.6
## 10 20081 KS 2021-09-03 dpm28 -252.
## 11 20203 KS 2021-09-03 dpm28 -472.
## 12 26033 MI 2021-09-03 dpm28 -26.8
## 13 37065 NC 2021-09-03 dpm28 -19.4
## 14 37127 NC 2021-09-03 dpm28 -10.6
## 15 41045 OR 2021-09-03 dpm28 -32.7
## 16 44005 RI 2021-09-03 dpm28 -122.
# Create histograms for burden metrics, with all values made no less than 0
metricLabels <- c("cpm28"="Cases per million (most recent 28 days)",
"dpm28"="Deaths per million (most recent 28 days)",
"tcpm"="Cases per million (aggregate)",
"tdpm"="Deaths per million (aggregate)"
)
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
ggplot(aes(x=pmax(0, value))) +
geom_histogram(fill="lightblue", bins=50) +
facet_wrap(~metricLabels[name], scales="free") +
labs(title="Coronavirus burden by county as of Sep 3, 20201",
caption="Data from USA Facts",
x="Burden",
y="# Counties"
)
# Create plot of cases (past 28 days)
burdenSummary %>%
select(fips=countyFIPS, cpm28) %>%
mutate(cpm28=pmin(pmax(cpm28, 0), 25000)/1000) %>%
usmap::plot_usmap(regions="counties", exclude="NE", data=., values="cpm28") +
scale_fill_continuous("CPM 000s\n(28 days)", low="grey", high="red") +
labs(title="Cases per million (past 28 days) by county as of Sep 3, 2021",
subtitle="Capped at 25k",
caption="Source: USA Facts\nNebraska excluded due to unreliable data"
)
# Create plot of deaths (past 28 days)
burdenSummary %>%
select(fips=countyFIPS, dpm28) %>%
mutate(dpm28=pmin(pmax(dpm28, 0), 250)) %>%
usmap::plot_usmap(regions="counties", exclude=c("NE", "FL"), data=., values="dpm28") +
scale_fill_continuous("DPM 000s\n(28 days)", low="grey", high="red") +
labs(title="Deaths per million (past 28 days) by county as of Sep 3, 2021",
subtitle="Capped at 250",
caption="Source: USA Facts\nNebraska and Florida excluded due to unreliable data"
)
Further exploration of Nebraska and Florida is merited:
# Attache county name and population data
burdenPop <- burdenSummary %>%
left_join(getCountyData(), by=c("countyFIPS", "state"))
burdenPop
## # A tibble: 3,142 x 9
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28 countyName pop
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 01001 AL 2021-09-03 2130. 159462. 89.5 21747. Autauga County 55869
## 2 01003 AL 2021-09-03 1756. 151361. 278. 31209. Baldwin County 223234
## 3 01005 AL 2021-09-03 2633. 127279. 81.0 22563. Barbour County 24686
## 4 01007 AL 2021-09-03 3304. 159864. 357. 27105. Bibb County 22394
## 5 01009 AL 2021-09-03 2525. 151541. 104. 21115. Blount County 57826
## 6 01011 AL 2021-09-03 4158. 138501. 0 13068. Bullock County 10101
## 7 01013 AL 2021-09-03 3908. 147367. 206. 20362. Butler County 19448
## 8 01015 AL 2021-09-03 3125. 161172. 176. 23221. Calhoun County 113605
## 9 01017 AL 2021-09-03 3819. 145546. 60.1 23516. Chambers County 33254
## 10 01019 AL 2021-09-03 1871. 96541. 38.2 18705. Cherokee County 26196
## # ... with 3,132 more rows
# Counties of at least 25k with non-positive cases and/or deaths in the past 28 days
burdenZero <- burdenPop %>%
pivot_longer(c(tdpm, tcpm, dpm28, cpm28)) %>%
filter(name %in% c("dpm28", "cpm28"), value <= 0) %>%
mutate(is25k=ifelse(pop >= 25000, "Y", "N"))
burdenZero %>%
group_by(state, name) %>%
summarize(n=n(), pop=sum(pop), value=sum(value), n25k=sum(is25k=="Y"),.groups="drop") %>%
arrange(-pop)
## # A tibble: 53 x 6
## state name n pop value n25k
## <chr> <chr> <int> <dbl> <dbl> <int>
## 1 FL dpm28 67 21477737 0 53
## 2 NJ dpm28 21 8882190 0 21
## 3 NE cpm28 93 1934408 0 12
## 4 NE dpm28 93 1934408 0 12
## 5 CO dpm28 20 890924 -154. 4
## 6 VA dpm28 44 889505 0 10
## 7 MN dpm28 45 771080 0 9
## 8 IA dpm28 49 749669 0 5
## 9 CA dpm28 14 631858 0 6
## 10 WI dpm28 24 597268 0 10
## # ... with 43 more rows
# Create the baseline map
p1 <- burdenZero %>%
pivot_wider() %>%
mutate(noCPM=(!is.na(cpm28) & is.na(dpm28)),
noDPM=(!is.na(dpm28) & is.na(cpm28)),
noBoth=(!is.na(cpm28) & !is.na(dpm28)),
ctyType=factor(case_when(is25k=="N" ~ "<25k",
noBoth ~ "Both",
noDPM ~ "Death",
noCPM ~ "Cases",
TRUE ~ "Logic Error"
)
)
) %>%
select(fips=countyFIPS, ctyType) %>%
usmap::plot_usmap(regions="counties", values="ctyType", data=.)
p1
# Map with state overlays and legend
p2 <- ggplot() +
geom_polygon(data=p1[[1]],
aes(x=x, y=y, group=group, fill=p1[[1]]$ctyType),
color = NA,
size = 0.1
) +
geom_polygon(data=usmap::plot_usmap("states")[[1]],
aes(x=x, y=y, group=group),
color = "black",
lwd=1.25,
fill = alpha(0.001)
) +
coord_equal() +
ggthemes::theme_map() +
labs(title="Counties reporting non-positive cases and/or deaths in the past 28 days",
subtitle="County-level data as of Sep 3, 2021",
caption="Source: USA Facts"
) +
scale_fill_brewer("Missing\nMetric", na.value="white", palette="Set2")
p2
Missing data are common in Florida (deaths), New Jersey (deaths), and Nebraska (cases and deaths). Smaller counties may not have any cases or (especially) deaths in a month, and all other states some counties reporting burdens in the past 28 days.
Data for large counties in Florida, Nebraska, and New Jersey are explored:
set.seed(2109081404)
exploreCounties <- getCountyData() %>%
filter(state %in% c("FL", "NE", "NJ"), pop >= 100000) %>%
group_by(state) %>%
sample_n(3) %>%
ungroup()
exploreCounties
## # A tibble: 9 x 4
## countyFIPS countyName state pop
## <chr> <chr> <chr> <dbl>
## 1 12053 Hernando County FL 193920
## 2 12091 Okaloosa County FL 210738
## 3 12097 Osceola County FL 375751
## 4 31153 Sarpy County NE 187196
## 5 31055 Douglas County NE 571327
## 6 31109 Lancaster County NE 319090
## 7 34019 Hunterdon County NJ 124371
## 8 34005 Burlington County NJ 445349
## 9 34037 Sussex County NJ 140488
cty_newdata_20210905$dfPerCapita %>%
right_join(exploreCounties, by=c("countyFIPS", "state")) %>%
ggplot(aes(x=date, y=cases/1000)) +
geom_line(aes(group=countyFIPS, color=countyName)) +
facet_wrap(~state, scales="free_y") +
labs(title="Total cases for select counties in FL, NE, NJ",
x=NULL,
y="Total cases (000s)",
subtitle="Dashed line is 28 days prior to end of data"
) +
geom_vline(xintercept=as.Date("2021-08-07"), lty=2) +
scale_color_discrete("County")
cty_newdata_20210905$dfPerCapita %>%
right_join(exploreCounties, by=c("countyFIPS", "state")) %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=countyFIPS, color=countyName)) +
facet_wrap(~state, scales="free_y") +
labs(title="Total deaths for select counties in FL, NE, NJ",
x=NULL,
y="Total deaths (000s)",
subtitle="Dashed line is 28 days prior to end of data"
) +
geom_vline(xintercept=as.Date("2021-08-07"), lty=2) +
scale_color_discrete("County")
Reporting for NE (both) and FL (deaths) for select counties has a clear break point. The deaths in NJ were close to flat-lining already, so the plots do not appear off-trend. However, it would be very surprising for a large county to have zero coronavirus deaths in a month.
The relationship to the Unallocated bucket by state is also explored:
# Combination of cases and deaths
temp_casedeath <- cty_newdata_20210905$dfRaw$usafCase %>%
full_join(cty_newdata_20210905$dfRaw$usafDeath)
## Joining, by = c("countyFIPS", "countyName", "state", "stateFIPS", "date")
# Exploration of cases and deaths
temp_flnenj <- temp_casedeath %>%
filter(state %in% c("NJ", "NE", "FL")) %>%
mutate(ym=customYYYYMM(date)) %>%
group_by(countyFIPS, countyName, state, ym) %>%
summarize(across(c("new_cases", "new_deaths"), sum, na.rm=TRUE),
across(c("cases", "deaths"), last),
.groups="drop"
) %>%
pivot_longer(-c(countyFIPS, countyName, state, ym)) %>%
group_by(state, ym, name) %>%
summarize(ua=sum(ifelse(countyFIPS=="00000", value, 0)), full=sum(value), .groups="drop")
temp_flnenj
## # A tibble: 252 x 5
## state ym name ua full
## <chr> <chr> <chr> <dbl> <dbl>
## 1 FL 2020-01 cases 0 0
## 2 FL 2020-01 deaths 0 0
## 3 FL 2020-01 new_cases 0 0
## 4 FL 2020-01 new_deaths 0 0
## 5 FL 2020-02 cases 0 0
## 6 FL 2020-02 deaths 0 0
## 7 FL 2020-02 new_cases 0 0
## 8 FL 2020-02 new_deaths 0 0
## 9 FL 2020-03 cases 2 6741
## 10 FL 2020-03 deaths 0 85
## # ... with 242 more rows
for (keyState in c("FL", "NE", "NJ")) {
p1 <- temp_flnenj %>%
filter(state %in% all_of(keyState), ym < "2021-09") %>%
ggplot(aes(x=lubridate::ym(ym))) + geom_col(aes(y=full), fill="lightblue") +
geom_line(aes(y=ua), lty=2) +
facet_wrap(~name, scales="free_y") +
geom_point(aes(y=ua)) +
labs(title=paste0("Total burden and unallocated burden by state and month for: ", keyState),
x="Month",
y="Burden",
subtitle="Dashed line and dots are unallocated, bars are sum"
)
print(p1)
}
The states of FL, NE, NJ appear to assign large amounts of recent data to “unallocated”. The proportion of unallocated by month is also explored:
# Unallocated by state data
temp_ua <- temp_casedeath %>%
mutate(ua=ifelse(countyFIPS=="00000", "Unallocated", "Allocated")) %>%
group_by(state, date, ua) %>%
summarize(across(where(is.numeric), sum), .groups="drop") %>%
mutate(ym=customYYYYMM(date)) %>%
group_by(state, ua, ym) %>%
summarize(across(c("new_cases", "new_deaths"), sum, na.rm=TRUE),
across(c("cases", "deaths"), last),
.groups="drop"
) %>%
pivot_longer(-c(state, ua, ym))
temp_ua
## # A tibble: 8,568 x 5
## state ua ym name value
## <chr> <chr> <chr> <chr> <dbl>
## 1 AK Allocated 2020-01 new_cases 0
## 2 AK Allocated 2020-01 new_deaths 0
## 3 AK Allocated 2020-01 cases 0
## 4 AK Allocated 2020-01 deaths 0
## 5 AK Allocated 2020-02 new_cases 0
## 6 AK Allocated 2020-02 new_deaths 0
## 7 AK Allocated 2020-02 cases 0
## 8 AK Allocated 2020-02 deaths 0
## 9 AK Allocated 2020-03 new_cases 133
## 10 AK Allocated 2020-03 new_deaths 2
## # ... with 8,558 more rows
# States ordered by proportion of unallocated deaths
tempStateSort <- temp_ua %>%
pivot_wider(c(state, ym, name), names_from="ua") %>%
filter(name=="deaths", ym=="2021-08") %>%
mutate(pct=Unallocated/Allocated) %>%
arrange(-pct) %>%
pull(state)
# Exploration by metric
for (keyMetric in unique(temp_ua$name)) {
p1 <- temp_ua %>%
filter(name %in% all_of(keyMetric), ym < "2021-09", ym > "2020-02") %>%
ggplot(aes(x=lubridate::ym(ym), y=value)) +
geom_col(aes(fill=ua), position="fill") +
theme(axis.text.x=element_text(angle=90)) +
labs(x=NULL,
y="Proportion of data",
title=paste0("Proportion unallocated by state on metric: ", keyMetric),
subtitle="States sorted from highest percentage of cumulative unallocated deaths to lowest"
) +
facet_wrap(~factor(state, levels=tempStateSort)) +
scale_fill_discrete("Type")
print(p1)
}
## Warning: Removed 8 rows containing missing values (geom_col).
## Warning: Removed 2 rows containing missing values (geom_col).
The unallocated bucket is used differently by state, and negatives suggest a process of moving unallocated data to allocated at a future date. The biggest recent change in proportion unallocated are in FL and NE.
Summaries are refreshed with the latest data:
urlMapper[["usafCase"]] <- "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
urlMapper[["usafDeath"]] <- "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"
urlMapper[["usafPop"]] <- "https://static.usafacts.org/public/data/covid-19/covid_county_population_usafacts.csv"
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20211208.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20211208.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210905")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210905")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20211208 <- readRunUSAFacts(maxDate="2021-12-06",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20211208.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 94
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 318 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 600 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20211208.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 94
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 381 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 895 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20211208_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.35e+10 4.82e+7 2187205
## 2 after 1.34e+10 4.80e+7 2152270
## 3 pctchg 4.62e- 3 3.97e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 2.51e+8 781775 2187205
## 2 after 2.47e+8 752555 2152270
## 3 pctchg 1.42e-2 0.0374 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20211208$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
saveToRDS(cty_newdata_20211208, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20211208.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
Plots are updated for total deaths per capita by county:
# Create data for total burden and change in burden over past 28 days
burdenSummary <- cty_newdata_20211208$dfPerCapita %>%
group_by(countyFIPS, state) %>%
summarize(asofDate=max(date),
tdpm=sum(ifelse(date==max(date), tdpm, 0)),
tcpm=sum(ifelse(date==max(date), tcpm, 0)),
dpm28=sum(ifelse(date>max(date)-28, dpm, 0)),
cpm28=sum(ifelse(date>max(date)-28, cpm, 0)),
.groups="drop"
)
burdenSummary
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-12-06 2810. 189049. 107. 4063.
## 2 01003 AL 2021-12-06 2638. 171188. 62.7 2491.
## 3 01005 AL 2021-12-06 3241. 150207. 40.5 3808.
## 4 01007 AL 2021-12-06 4242. 194874. 134. 3081.
## 5 01009 AL 2021-12-06 3338. 186612. 121. 4444.
## 6 01011 AL 2021-12-06 4455. 151272. 99.0 297.
## 7 01013 AL 2021-12-06 5193. 177139. 103. 3497.
## 8 01015 AL 2021-12-06 4577. 199252. 106. 2280.
## 9 01017 AL 2021-12-06 4270. 174716. -60.1 571.
## 10 01019 AL 2021-12-06 2443. 122347. 76.3 4161.
## # ... with 3,132 more rows
# Negative values in the data (likely due to restatement)
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
filter(value < 0)
## # A tibble: 19 x 5
## countyFIPS state asofDate name value
## <chr> <chr> <date> <chr> <dbl>
## 1 01017 AL 2021-12-06 dpm28 -60.1
## 2 01027 AL 2021-12-06 dpm28 -75.6
## 3 01031 AL 2021-12-06 dpm28 -19.1
## 4 13037 GA 2021-12-06 cpm28 -646.
## 5 13099 GA 2021-12-06 cpm28 -196.
## 6 20001 KS 2021-12-06 dpm28 -80.8
## 7 20021 KS 2021-12-06 dpm28 -50.2
## 8 20029 KS 2021-12-06 dpm28 -228.
## 9 20041 KS 2021-12-06 dpm28 -54.2
## 10 20051 KS 2021-12-06 dpm28 -35.0
## 11 20127 KS 2021-12-06 dpm28 -178.
## 12 20157 KS 2021-12-06 dpm28 -216.
## 13 46027 SD 2021-12-06 dpm28 -71.1
## 14 48001 TX 2021-12-06 cpm28 -1334.
## 15 48145 TX 2021-12-06 cpm28 -231.
## 16 48253 TX 2021-12-06 cpm28 -846.
## 17 51595 VA 2021-12-06 cpm28 -1122.
## 18 51620 VA 2021-12-06 dpm28 -126.
## 19 51730 VA 2021-12-06 dpm28 -31.9
# Create histograms for burden metrics, with all values made no less than 0
metricLabels <- c("cpm28"="Cases per million (most recent 28 days)",
"dpm28"="Deaths per million (most recent 28 days)",
"tcpm"="Cases per million (aggregate)",
"tdpm"="Deaths per million (aggregate)"
)
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
ggplot(aes(x=pmax(0, value))) +
geom_histogram(fill="lightblue", bins=50) +
facet_wrap(~metricLabels[name], scales="free") +
labs(title="Coronavirus burden by county as of Dec 8, 2021",
subtitle="Values less than 0 treated as 0",
caption="Data from USA Facts",
x="Burden",
y="# Counties"
)
# Create plot of cases (past 28 days)
burdenSummary %>%
select(fips=countyFIPS, cpm28) %>%
mutate(cpm28=pmin(pmax(cpm28, 0), 25000)/1000) %>%
usmap::plot_usmap(regions="counties", exclude="", data=., values="cpm28") +
scale_fill_continuous("CPM 000s\n(28 days)", low="grey", high="red") +
labs(title="Cases per million (past 28 days) by county as of Dec 8, 2021",
subtitle="Capped at 25k",
caption="Source: USA Facts"
)
# Create plot of deaths (past 28 days)
burdenSummary %>%
select(fips=countyFIPS, dpm28) %>%
mutate(dpm28=pmin(pmax(dpm28, 0), 250)) %>%
usmap::plot_usmap(regions="counties", exclude=c("NE", "FL", "NJ", "OK"), data=., values="dpm28") +
scale_fill_continuous("DPM 000s\n(28 days)", low="grey", high="red") +
labs(title="Deaths per million (past 28 days) by county as of Dec 8, 2021",
subtitle="Capped at 250",
caption="Source: USA Facts\nFL, NE, NJ, OK excluded due to unreliable data"
)
A function is written to create burden summary data for a specified time period:
makeBurdenSummary <- function(lst,
groupVar=c("countyFIPS", "state"),
numVarFinal=c("tdpm", "tcpm"),
numVarSum=c("dpm", "cpm"),
keyDate=NULL,
dateRange=28
) {
# FUNCTION ARGUMENTS
# lst: list of processed county burden data
# groupVar: grouping variables for the final dataset
# numVarFinal: numeric variables to pull data from the key date
# numVarSum: numeric variables to sum from the key date interval
# keyDate: the key date for the summaries (NULL means use maximum in data)
# dateRange: number of days to include in the numeric interval summaries
# Find keyDate if not provided, convert to date if not already
if(is.null(keyDate)) keyDate <- lst[["dfPerCapita"]] %>% pull(date) %>% max()
if(!("Date" %in% class(keyDate))) keyDate <- as.Date(keyDate)
# Create summary
df <- lst[["dfPerCapita"]] %>%
group_by_at(all_of(groupVar)) %>%
summarize(asofDate=keyDate,
across(all_of(numVarFinal), .fns=~sum(ifelse(date==keyDate, .x, 0))),
across(all_of(numVarSum),
.fns=~sum(ifelse(date>keyDate-dateRange & date<=keyDate, .x, 0)),
.names=paste0("{.col}", as.character(dateRange))
),
.groups="drop"
)
# Return the data frame
df
}
# Function to make plot from county-level data
plotCountyData <- function(df,
fipsVar="countyFIPS",
numVar="cpm28",
allGT=0,
allLT=25000,
divBy=1000,
plotLegend="CPM 000s\n(28 days)",
thruDate=NULL,
plotLabel="Cases per million (past 28 days) by county as of: "
) {
# FUNCTION ARGUMENTS:
# df: data frame containing county data
# fipsVar: variable name containing FIPS code for county
# numVar: numeric variable to plot
# allGT: numVar will be floored at allGT prior to plotting
# allLT: numVar will be capped at allLT prior to plotting
# divBy: numVar will be divided by divBy prior to plotting
# plotLegend: label to use for plot legend
# thruDate: date that plots are through (NULL means get from data)
# plotLabel: main title for the plot
# Get thruDate if not provided
if(is.null(thruDate)) thruDate <- df %>% pull(asofDate) %>% max() %>% as.character()
p1 <- df %>%
select(all_of(c(fipsVar, numVar))) %>%
purrr::set_names(all_of(c("fips", numVar))) %>%
mutate(across(all_of(numVar), .fns=~pmin(pmax(.x, allGT), allLT)/divBy)) %>%
usmap::plot_usmap(regions="counties", exclude="", data=., values=numVar) +
scale_fill_continuous(plotLegend, low="grey", high="red") +
labs(title=paste0(plotLabel, thruDate),
subtitle="Floors and ceilings applied",
caption="Source: USA Facts"
)
# Return the plot object
p1
}
makeBurdenSummary(cty_newdata_20211208)
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-12-06 2810. 189049. 107. 4063.
## 2 01003 AL 2021-12-06 2638. 171188. 62.7 2491.
## 3 01005 AL 2021-12-06 3241. 150207. 40.5 3808.
## 4 01007 AL 2021-12-06 4242. 194874. 134. 3081.
## 5 01009 AL 2021-12-06 3338. 186612. 121. 4444.
## 6 01011 AL 2021-12-06 4455. 151272. 99.0 297.
## 7 01013 AL 2021-12-06 5193. 177139. 103. 3497.
## 8 01015 AL 2021-12-06 4577. 199252. 106. 2280.
## 9 01017 AL 2021-12-06 4270. 174716. -60.1 571.
## 10 01019 AL 2021-12-06 2443. 122347. 76.3 4161.
## # ... with 3,132 more rows
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-06-30")
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-06-30 2023. 129893. 35.8 1718.
## 2 01003 AL 2021-06-30 1411. 98672. 17.9 1581.
## 3 01005 AL 2021-06-30 2431. 95034. 40.5 243.
## 4 01007 AL 2021-06-30 2858. 120255. 0 1250.
## 5 01009 AL 2021-06-30 2404. 120828. 0 1695.
## 6 01011 AL 2021-06-30 4158. 123651. 0 1287.
## 7 01013 AL 2021-06-30 3651. 116310. 0 2160.
## 8 01015 AL 2021-06-30 2905. 130065. 52.8 1188.
## 9 01017 AL 2021-06-30 3699. 112347. -30.1 1474.
## 10 01019 AL 2021-06-30 1718. 71538. 0 344.
## # ... with 3,132 more rows
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-10-31", dateRange=364)
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm364 cpm364
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-10-31 2649. 183071. 2112. 144177.
## 2 01003 AL 2021-10-31 2486. 167412. 2168. 136207.
## 3 01005 AL 2021-10-31 3079. 145670. 2714. 102690.
## 4 01007 AL 2021-10-31 3885. 190944. 3215. 151737.
## 5 01009 AL 2021-10-31 3078. 179348. 2646. 143119.
## 6 01011 AL 2021-10-31 4356. 151173. 2673. 86328.
## 7 01013 AL 2021-10-31 4988. 172049. 2879. 119395.
## 8 01015 AL 2021-10-31 4331. 196523. 3759. 153462.
## 9 01017 AL 2021-10-31 4270. 173844. 2857. 131984.
## 10 01019 AL 2021-10-31 2329. 117041. 1756. 86998.
## # ... with 3,132 more rows
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-11-30", dateRange=91) %>%
plotCountyData(numVar="cpm91",
allLT=75000,
plotLegend="CPM 000s\n(91 days)",
plotLabel="Cases per million (past 91 days) by county as of: "
)
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-08-31", dateRange=91) %>%
plotCountyData(numVar="cpm91",
allLT=75000,
plotLegend="CPM 000s\n(91 days)",
plotLabel="Cases per million (past 91 days) by county as of: "
)
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-05-31", dateRange=91) %>%
plotCountyData(numVar="cpm91",
allLT=75000,
plotLegend="CPM 000s\n(91 days)",
plotLabel="Cases per million (past 91 days) by county as of: "
)
makeBurdenSummary(cty_newdata_20211208, keyDate="2021-02-28", dateRange=91) %>%
plotCountyData(numVar="cpm91",
allLT=75000,
plotLegend="CPM 000s\n(91 days)",
plotLabel="Cases per million (past 91 days) by county as of: "
)
Next steps are to add better scaling and labeling capability for different time periods:
helperPlotCountyBurden <- function(ctyData,
keyDate,
dateRange=28,
dataType="cpm",
divBy=if(dataType=="cpm") 1000 else 1,
allLT=if(dataType=="cpm") 25000*dateRange/28 else 250*dateRange/28
) {
# FUNCTION ARGUMENTS:
# ctyData: processed county level data file
# keyDate: the key date (data will be up through this date)
# dateRange: the number of dates in the data interval
# dataType: the type of data (currently handles CPM and DPM)
# divBy: metric plotted should be divided by this number
# allLT: plots will be capped at this level of burden
makeBurdenSummary(ctyData, keyDate=keyDate, dateRange=dateRange) %>%
plotCountyData(numVar=paste0(dataType, dateRange),
allLT=allLT,
divBy=divBy,
plotLegend=paste0(stringr::str_to_upper(dataType),
if(divBy==1000) " 000s" else if(divBy==1) "" else paste0("Units: ", divBy),
"\n(",
dateRange,
" days)"),
plotLabel=paste0(if(dataType=="cpm") "Cases" else if(dataType=="dpm") "Deaths" else dataType,
" per million (past ",
dateRange,
" days) by county as of: "
)
)
}
helperPlotCountyBurden(cty_newdata_20211208, keyDate="2021-11-30", dateRange=91)
helperPlotCountyBurden(cty_newdata_20211208, keyDate="2021-11-30", dataType="dpm", dateRange=91)
helperPlotCountyBurden(cty_newdata_20211208, keyDate="2020-04-30", dateRange=56)
helperPlotCountyBurden(cty_newdata_20211208, keyDate="2020-04-30", dateRange=56, dataType="dpm")
Faceted plots can also be created:
dfTempBurden <- as.Date("2021-10-31") %>% lubridate::add_with_rollback(months(seq(0, -15, by=-5))) %>%
purrr::map_dfr(.f=~makeBurdenSummary(cty_newdata_20211208, keyDate=.x, dateRange=154))
dfTempBurden
## # A tibble: 12,568 x 7
## countyFIPS state asofDate tdpm tcpm dpm154 cpm154
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-10-31 2649. 183071. 680. 55236.
## 2 01003 AL 2021-10-31 2486. 167412. 1093. 70563.
## 3 01005 AL 2021-10-31 3079. 145670. 689. 51122.
## 4 01007 AL 2021-10-31 3885. 190944. 1027. 71984.
## 5 01009 AL 2021-10-31 3078. 179348. 674. 60647.
## 6 01011 AL 2021-10-31 4356. 151173. 198. 29106.
## 7 01013 AL 2021-10-31 4988. 172049. 1337. 57949.
## 8 01015 AL 2021-10-31 4331. 196523. 1479. 67814.
## 9 01017 AL 2021-10-31 4270. 173844. 571. 63631.
## 10 01019 AL 2021-10-31 2329. 117041. 611. 45961.
## # ... with 12,558 more rows
dfTempBurden %>%
rename(fips=countyFIPS) %>%
mutate(burden=pmax(pmin(dpm154, 1500), 0)) %>%
select(fips, burden, asofDate) %>%
usmap::plot_usmap(regions="counties", data=., values="burden") +
labs(title="5-month coronavirus deaths by county",
subtitle="Floors and ceilings applied",
caption="Source: USA Facts"
) +
scale_fill_continuous("DPM\n5-month", low="grey", high="red") +
facet_wrap(~asofDate) +
theme(legend.position="bottom")
Faceted plots can also be created on different time periods:
dfTempBurden <- as.Date("2021-11-30") %>% lubridate::add_with_rollback(months(c(0, -2, -4, -6))) %>%
purrr::map_dfr(.f=~makeBurdenSummary(cty_newdata_20211208, keyDate=.x, dateRange=35))
dfTempBurden
## # A tibble: 12,568 x 7
## countyFIPS state asofDate tdpm tcpm dpm35 cpm35
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-11-30 2810. 188405. 161. 8323.
## 2 01003 AL 2021-11-30 2638. 170691. 166. 3669.
## 3 01005 AL 2021-11-30 3241. 149842. 162. 4699.
## 4 01007 AL 2021-11-30 4198. 194025. 357. 3974.
## 5 01009 AL 2021-11-30 3320. 185539. 277. 8681.
## 6 01011 AL 2021-11-30 4455. 150975. 99.0 693.
## 7 01013 AL 2021-11-30 5142. 176831. 206. 7919.
## 8 01015 AL 2021-11-30 4560. 198917. 246. 3019.
## 9 01017 AL 2021-11-30 4270. 173964. 0 5112.
## 10 01019 AL 2021-11-30 2405. 121507. 115. 5764.
## # ... with 12,558 more rows
dfTempBurden %>%
rename(fips=countyFIPS) %>%
mutate(burden=pmax(pmin(cpm35, 30000), 0)/1000) %>%
select(fips, burden, asofDate) %>%
usmap::plot_usmap(regions="counties", data=., values="burden") +
labs(title="5-week coronavirus cases by county",
subtitle="Floors and ceilings applied",
caption="Source: USA Facts"
) +
scale_fill_continuous("CPM(000)\n5-week", low="grey", high="red") +
facet_wrap(~asofDate) +
theme(legend.position="bottom")
dfTempBurden %>%
rename(fips=countyFIPS) %>%
mutate(burden=pmax(pmin(dpm35, 300), 0)) %>%
select(fips, burden, asofDate) %>%
usmap::plot_usmap(regions="counties", data=., values="burden") +
labs(title="5-week coronavirus deaths by county",
subtitle="Floors and ceilings applied",
caption="Source: USA Facts"
) +
scale_fill_continuous("DPM\n5-week", low="grey", high="red") +
facet_wrap(~asofDate) +
theme(legend.position="bottom")
Deaths data in particular requires further investigation for issues in GA, OK, KY, FL, NE, NJ, and potentially other areas. Exploration of Florida:
flCounties <- cty_newdata_20211208$dfPerCapita %>%
filter(state=="FL") %>%
select(countyFIPS, date, cases, new_cases, deaths, new_deaths) %>%
pivot_longer(-c(countyFIPS, date)) %>%
arrange(countyFIPS, date) %>%
group_by(countyFIPS, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA, align="center")) %>%
ungroup()
flCounties
## # A tibble: 183,580 x 5
## countyFIPS date name value value7
## <chr> <date> <chr> <dbl> <dbl>
## 1 12001 2020-01-22 cases 0 NA
## 2 12001 2020-01-22 new_cases 0 NA
## 3 12001 2020-01-22 deaths 0 NA
## 4 12001 2020-01-22 new_deaths 0 NA
## 5 12001 2020-01-23 cases 0 NA
## 6 12001 2020-01-23 new_cases 0 NA
## 7 12001 2020-01-23 deaths 0 NA
## 8 12001 2020-01-23 new_deaths 0 NA
## 9 12001 2020-01-24 cases 0 NA
## 10 12001 2020-01-24 new_cases 0 NA
## # ... with 183,570 more rows
flCounties %>%
group_by(date, name) %>%
summarize(across(c(value, value7), .fns=sum), .groups="drop") %>%
filter(!is.na(value7)) %>%
ggplot(aes(x=date, y=value7)) +
geom_line() +
facet_wrap(~name, scales="free_y") +
labs(x=NULL,
y="Rolling 7-day mean",
title="Summed county burden by metric (7-day mean)",
subtitle="FL counties"
)
The USAF data source no longer captures Florida deaths. The process is updated to functional form:
checkStateCountySums <- function(lst,
keyStates,
printPlot=TRUE,
returnData=!isTRUE(printPlot)
) {
# FUNCTION ARGUMENTS
# lst: a processed list of county data
# keyStates: character vector of state abbreviations to use
# printPlot: boolean, should the plot be printed?
# returnData: boolean, should the aggregated data frame be returned?
# Create the data frame of counties in keyStates
df <- lst[["dfPerCapita"]] %>%
filter(state %in% all_of(keyStates)) %>%
select(state, countyFIPS, date, cases, new_cases, deaths, new_deaths) %>%
pivot_longer(-c(state, countyFIPS, date)) %>%
arrange(state, countyFIPS, date) %>%
group_by(state, countyFIPS, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA, align="center")) %>%
ungroup()
# Create the aggregated data
dfAgg <- df %>%
group_by(date, name) %>%
summarize(across(c(value, value7), .fns=sum), .groups="drop")
# Plot either the aggregate dat
p1 <- dfAgg %>%
filter(!is.na(value7)) %>%
ggplot(aes(x=date, y=value7)) +
geom_line() +
facet_wrap(~name, scales="free_y") +
labs(x=NULL,
y="Rolling 7-day mean",
title="Summed county burden by metric (7-day mean)",
subtitle=paste0("Sum of counties in: ", paste0(all_of(keyStates), collapse=", "))
)
# Print the plot if requested
if(printPlot) print(p1)
# Return the data if requested
if(returnData) return(dfAgg)
}
checkStateCountySums(cty_newdata_20211208, keyStates="FL")
checkStateCountySums(cty_newdata_20211208, keyStates=c("FL", "GA", "SC", "AL"))
checkStateCountySums(cty_newdata_20211208, keyStates=c("FL", "GA", "SC", "AL"), printPlot=FALSE)
## # A tibble: 2,740 x 4
## date name value value7
## <date> <chr> <dbl> <dbl>
## 1 2020-01-22 cases 0 NA
## 2 2020-01-22 deaths 0 NA
## 3 2020-01-22 new_cases 0 NA
## 4 2020-01-22 new_deaths 0 NA
## 5 2020-01-23 cases 0 NA
## 6 2020-01-23 deaths 0 NA
## 7 2020-01-23 new_cases 0 NA
## 8 2020-01-23 new_deaths 0 NA
## 9 2020-01-24 cases 0 NA
## 10 2020-01-24 deaths 0 NA
## # ... with 2,730 more rows
Several other states with potential data anomalies are explored:
# States that no longer report county-level deaths
checkStateCountySums(cty_newdata_20211208, keyStates="FL")
checkStateCountySums(cty_newdata_20211208, keyStates="NE")
checkStateCountySums(cty_newdata_20211208, keyStates="NJ")
# States with one or more large spikes in deaths (data restatement)
checkStateCountySums(cty_newdata_20211208, keyStates="GA")
checkStateCountySums(cty_newdata_20211208, keyStates="KY")
checkStateCountySums(cty_newdata_20211208, keyStates="MO")
checkStateCountySums(cty_newdata_20211208, keyStates="OK")
Next steps are to compare metrics against state-level reported totals:
statePerCapita <- readFromRDS("cdc_daily_211202")$dfPerCapita
flState <- statePerCapita %>%
select(date, state, cases=tot_cases, new_cases, deaths=tot_deaths, new_deaths) %>%
filter(state=="FL", date <= "2021-12-01") %>%
pivot_longer(-c(date, state)) %>%
mutate(value=ifelse(is.na(value), 0, value)) %>%
arrange(date, state, name) %>%
group_by(state, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA, align="center"), src="state") %>%
filter(!is.na(value7)) %>%
ungroup()
flState
## # A tibble: 2,696 x 6
## date state name value value7 src
## <date> <chr> <chr> <dbl> <dbl> <chr>
## 1 2020-01-25 FL cases 0 0 state
## 2 2020-01-25 FL deaths 0 0 state
## 3 2020-01-25 FL new_cases 0 0 state
## 4 2020-01-25 FL new_deaths 0 0 state
## 5 2020-01-26 FL cases 0 0 state
## 6 2020-01-26 FL deaths 0 0 state
## 7 2020-01-26 FL new_cases 0 0 state
## 8 2020-01-26 FL new_deaths 0 0 state
## 9 2020-01-27 FL cases 0 0 state
## 10 2020-01-27 FL deaths 0 0 state
## # ... with 2,686 more rows
flCounties %>%
group_by(date, name) %>%
summarize(across(c(value, value7), .fns=sum), .groups="drop") %>%
filter(!is.na(value7)) %>%
mutate(state="FL", src="county") %>%
bind_rows(flState) %>%
ggplot(aes(x=date, y=value7)) +
geom_line(aes(group=src, color=src)) +
facet_wrap(~name, scales="free_y") +
labs(x=NULL,
y="Rolling 7-day mean",
title="Summed county burden by metric (7-day mean)",
subtitle="FL counties"
)
Deaths data in Florida appear generally aligned through mid-2021, when state data continues reporting and county data stops. Cases data in Florida appear generally aligned throughout.
The process is converted to functional form:
compareStateSummedCounty <- function(dfState,
dfCounty,
inclStates,
dateThru=NULL,
makePlot=TRUE,
returnData=FALSE
) {
# FUNCTION ARGUMENTS:
# dfState: processed state-level metrics
# dfCounty: processed county-level metrics
# inclStates: character vector of states to include
# dateThru: latest date for analyzsis (NULL means use all data)
# makePlot: boolean, should plots be created for each state?
# returnData: boolean, should a list of processed dfState and processed dfCounty be returned?
dfState <- dfState %>%
select(date, state, cases=tot_cases, new_cases, deaths=tot_deaths, new_deaths) %>%
filter(state %in% all_of(inclStates),
date <= if(is.null(dateThru)) max(date) else as.Date(dateThru)
) %>%
pivot_longer(-c(date, state)) %>%
mutate(value=ifelse(is.na(value), 0, value)) %>%
arrange(date, state, name) %>%
group_by(state, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA, align="center"), src="state") %>%
ungroup()
dfCounty <- dfCounty %>%
filter(state %in% all_of(inclStates),
date <= if(is.null(dateThru)) max(date) else as.Date(dateThru)
) %>%
select(state, countyFIPS, date, cases, new_cases, deaths, new_deaths) %>%
pivot_longer(-c(state, countyFIPS, date)) %>%
arrange(state, countyFIPS, date) %>%
group_by(state, countyFIPS, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA, align="center")) %>%
ungroup() %>%
filter(state %in% all_of(inclStates)) %>%
group_by(state, date, name) %>%
summarize(across(c(value, value7), .fns=sum), .groups="drop") %>%
filter(!is.na(value7)) %>%
mutate(src="county")
if(isTRUE(makePlot)) {
for(thisState in all_of(inclStates)) {
p1 <- dfCounty %>%
filter(state %in% all_of(thisState)) %>%
bind_rows(filter(dfState, state %in% all_of(thisState))) %>%
ggplot(aes(x=date, y=value7)) +
geom_line(aes(group=src, color=src)) +
facet_wrap(~name, scales="free_y") +
labs(x=NULL,
y="Rolling 7-day mean",
title="Summed county burden by metric (7-day mean)",
subtitle=paste0(thisState, " counties")
)
print(p1)
}
}
if(returnData) list(dfState=dfState, dfCounty=dfCounty)
}
statePerCapita <- readFromRDS("cdc_daily_211202")$dfPerCapita
countyData <- cty_newdata_20211208$dfPerCapita
tempStateCompareList <- compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c("FL", "MO", "OK", "TX"),
dateThru="2021-11-30",
returnData=TRUE
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
tempStateCompareList
## $dfState
## # A tibble: 10,948 x 6
## date state name value value7 src
## <date> <chr> <chr> <dbl> <dbl> <chr>
## 1 2020-01-01 TX cases 0 NA state
## 2 2020-01-01 TX deaths 0 NA state
## 3 2020-01-01 TX new_cases 0 NA state
## 4 2020-01-01 TX new_deaths 0 NA state
## 5 2020-01-02 TX cases 0 NA state
## 6 2020-01-02 TX deaths 0 NA state
## 7 2020-01-02 TX new_cases 0 NA state
## 8 2020-01-02 TX new_deaths 0 NA state
## 9 2020-01-03 TX cases 0 NA state
## 10 2020-01-03 TX deaths 0 NA state
## # ... with 10,938 more rows
##
## $dfCounty
## # A tibble: 10,768 x 6
## state date name value value7 src
## <chr> <date> <chr> <dbl> <dbl> <chr>
## 1 FL 2020-01-25 cases 0 0 county
## 2 FL 2020-01-25 deaths 0 0 county
## 3 FL 2020-01-25 new_cases 0 0 county
## 4 FL 2020-01-25 new_deaths 0 0 county
## 5 FL 2020-01-26 cases 0 0 county
## 6 FL 2020-01-26 deaths 0 0 county
## 7 FL 2020-01-26 new_cases 0 0 county
## 8 FL 2020-01-26 new_deaths 0 0 county
## 9 FL 2020-01-27 cases 0 0 county
## 10 FL 2020-01-27 deaths 0 0 county
## # ... with 10,758 more rows
A scoring metric is added for similarity of the data:
tempStateCompareList$dfState %>%
filter(!is.na(value7), date >= "2020-02-15", date <= "2021-11-30") %>%
select(date, state, name, state7=value7) %>%
inner_join(tempStateCompareList$dfCounty %>%
filter(!is.na(value7), date >= "2020-02-15", date <= "2021-11-30") %>%
select(date, state, name, county7=value7),
by=c("state", "name", "date")
) %>%
group_by(state, name) %>%
summarize(vars7=var(state7),
varc7=var(county7),
vardelta=var(state7-county7),
corc7s7=cor(state7, county7),
n=n(),
.groups="drop"
) %>%
mutate(varExp=1-2*vardelta/pmax(vars7, varc7)) %>%
ggplot(aes(x=state, y=varExp)) +
geom_col(fill="lightblue") +
geom_text(aes(y=varExp/2, label=round(varExp, 2))) +
facet_wrap(~name) +
lims(y=c(NA, 1)) +
labs(x=NULL, y="Rough estimate of variance explained", title="Similarity of state and summed county metrics")
Cases are generally well aligned. There are differences in timing of deaths, and spikiness that leads to much different curves in (especially) FL and OK.
The scoring metric is converted to functional form:
scoreSimilarity <- function(lst,
minDate=NULL,
maxDate=NULL,
makeFacet=TRUE
) {
# FUNCTION ARGUMENTS:
# lst: list containing state-level and summed county-level data
# minDate: starting date for scoring comparisons (NULL means lowest common date in data)
# maxDate: ending date for scoring comparisons (NULL means highest common date in data)
# makeFacet: boolean, should plots be facetted, or by variable?
# if not TRUE, plots will be made sideways and sorted by variable
# Create minDate and maxDate if not provided
if(is.null(minDate)) minDate <- max(min(lst[["dfState"]]$date), min(lst[["dfCounty"]]$date))
if(is.null(maxDate)) maxDate <- min(max(lst[["dfState"]]$date), max(lst[["dfCounty"]]$date))
plotData <- lst[["dfState"]] %>%
filter(!is.na(value7), date >= minDate, date <= maxDate) %>%
select(date, state, name, state7=value7) %>%
inner_join(lst[["dfCounty"]] %>%
filter(!is.na(value7), date >= minDate, date <= maxDate) %>%
select(date, state, name, county7=value7),
by=c("state", "name", "date")
) %>%
group_by(state, name) %>%
summarize(vars7=var(state7),
varc7=var(county7),
vardelta=var(state7-county7),
corc7s7=cor(state7, county7),
n=n(),
.groups="drop"
) %>%
mutate(varExp=1-2*vardelta/pmax(vars7, varc7))
innerMakePlot <- function(keepName, coordFlip=FALSE) {
p1 <- plotData %>%
filter(name %in% all_of(keepName)) %>%
ggplot(aes(x=fct_reorder(state, varExp), y=varExp)) +
geom_col(fill="lightblue") +
geom_text(aes(y=varExp/2, label=round(varExp, 2))) +
lims(y=c(NA, 1)) +
labs(x=NULL,
y="Rough estimate of variance explained",
title="Similarity of state and summed county metrics",
subtitle=paste0("Time period: ", minDate, " to ", maxDate)
) +
facet_wrap(~name)
if(isTRUE(coordFlip)) p1 <- p1 + coord_flip()
p1
}
keyVars <- plotData %>% pull(name) %>% unique() %>% sort()
if(!isTRUE(makeFacet)) {
for(varName in keyVars) innerMakePlot(keepName=varName, coordFlip=TRUE) %>% print()
} else {
innerMakePlot(keepName=keyVars) %>% print()
}
}
scoreSimilarity(tempStateCompareList)
scoreSimilarity(tempStateCompareList, minDate="2020-02-15")
scoreSimilarity(tempStateCompareList, maxDate="2021-11-27")
scoreSimilarity(tempStateCompareList, minDate="2020-02-15", maxDate="2021-11-27")
The process is run for all states:
tempStateCompareList_v2 <- compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c(state.abb, "DC"),
dateThru="2021-11-30",
makePlot=FALSE,
returnData=TRUE
)
scoreSimilarity(tempStateCompareList_v2, minDate="2020-02-15", maxDate="2021-11-27", makeFacet=FALSE)
Data are generally well aligned between sources, with some exceptions that merit further investigation:
# Large difference in cumulative cases
compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c("GA"),
dateThru="2021-11-30"
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Large difference in cumulative deaths
compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c("FL", "OK", "NE", "KY"),
dateThru="2021-11-30"
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Large difference in daily cases
compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c("WV", "NJ", "MA"),
dateThru="2021-11-30"
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Large differences in daily deaths
compareStateSummedCounty(dfState=statePerCapita,
dfCounty=countyData,
inclStates=c("MO", "RI", "KS", "OH"),
dateThru="2021-11-30"
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
Summaries are refreshed with the latest data:
urlMapper[["usafCase"]] <- "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
urlMapper[["usafDeath"]] <- "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"
urlMapper[["usafPop"]] <- "https://static.usafacts.org/public/data/covid-19/covid_county_population_usafacts.csv"
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20220104.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20220104.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20211208")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20211208")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20220104 <- readRunUSAFacts(maxDate="2022-01-02",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20220104.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 27
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 205 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20220104.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 27
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 197 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220104_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.48e+10 5.36e+7 2273416
## 2 after 1.48e+10 5.33e+7 2237104
## 3 pctchg 4.71e- 3 4.35e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 2.72e+8 818823 2273416
## 2 after 2.68e+8 788238 2237104
## 3 pctchg 1.61e-2 0.0374 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20220104$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
saveToRDS(cty_newdata_20220104, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20220104.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
County-level vaccine data can also be downloaded:
# File location
vaxFile <- "./RInputFiles/Coronavirus/county_vaccine_20220126.csv"
# Download the file if it has not yet been downloaded
if(!file.exists(vaxFile))
fileDownload(vafFile, url="https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD")
# Read file and keep only select variables
fileRead(vaxFile, n_max=1000)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## FIPS = col_character(),
## Recip_County = col_character(),
## Recip_State = col_character(),
## SVI_CTGY = col_character(),
## Metro_status = col_character()
## )
## i Use `spec()` for the full column specifications.
## # A tibble: 1,000 x 51
## Date FIPS MMWR_week Recip_County Recip_State Completeness_pct
## <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 01/25/2022 06031 4 Kings County CA 97.5
## 2 01/25/2022 08035 4 Douglas County CO 97.1
## 3 01/25/2022 24027 4 Howard County MD 98.2
## 4 01/25/2022 UNK 4 Unknown County MN 94.6
## 5 01/25/2022 37159 4 Rowan County NC 97
## 6 01/25/2022 36095 4 Schoharie County NY 97.6
## 7 01/25/2022 21023 4 Bracken County KY 94
## 8 01/25/2022 21121 4 Knox County KY 94
## 9 01/25/2022 50009 4 Essex County VT 73.6
## 10 01/25/2022 53075 4 Whitman County WA 96.1
## # ... with 990 more rows, and 45 more variables:
## # Administered_Dose1_Recip <dbl>, Administered_Dose1_Pop_Pct <dbl>,
## # Administered_Dose1_Recip_5Plus <dbl>,
## # Administered_Dose1_Recip_5PlusPop_Pct <dbl>,
## # Administered_Dose1_Recip_12Plus <dbl>,
## # Administered_Dose1_Recip_12PlusPop_Pct <dbl>,
## # Administered_Dose1_Recip_18Plus <dbl>,
## # Administered_Dose1_Recip_18PlusPop_Pct <dbl>,
## # Administered_Dose1_Recip_65Plus <dbl>,
## # Administered_Dose1_Recip_65PlusPop_Pct <dbl>, Series_Complete_Yes <dbl>,
## # Series_Complete_Pop_Pct <dbl>, Series_Complete_5Plus <dbl>,
## # Series_Complete_5PlusPop_Pct <dbl>, Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>, Booster_Doses <dbl>,
## # Booster_Doses_Vax_Pct <dbl>, Booster_Doses_18Plus <dbl>,
## # Booster_Doses_18Plus_Vax_Pct <dbl>, Booster_Doses_50Plus <dbl>,
## # Booster_Doses_50Plus_Vax_Pct <dbl>, Booster_Doses_65Plus <dbl>,
## # Booster_Doses_65Plus_Vax_Pct <dbl>, SVI_CTGY <chr>,
## # Series_Complete_Pop_Pct_SVI <dbl>, Series_Complete_5PlusPop_Pct_SVI <dbl>,
## # Series_Complete_12PlusPop_Pct_SVI <dbl>,
## # Series_Complete_18PlusPop_Pct_SVI <dbl>,
## # Series_Complete_65PlusPop_Pct_SVI <dbl>, Metro_status <chr>,
## # Series_Complete_Pop_Pct_UR_Equity <dbl>,
## # Series_Complete_5PlusPop_Pct_UR_Equity <dbl>,
## # Series_Complete_12PlusPop_Pct_UR_Equity <dbl>,
## # Series_Complete_18PlusPop_Pct_UR_Equity <dbl>,
## # Series_Complete_65PlusPop_Pct_UR_Equity <dbl>, Census2019 <dbl>,
## # Census2019_5PlusPop <dbl>, Census2019_12PlusPop <dbl>,
## # Census2019_18PlusPop <dbl>, Census2019_65PlusPop <dbl>
vaxPartialRaw_20220126 <- fileRead(vaxFile) %>%
select(date=Date,
FIPS,
countyName=Recip_County,
state=Recip_State,
vxcpoppct=Series_Complete_Pop_Pct,
vxcgte18pct=Series_Complete_18PlusPop_Pct,
vxcgte65pct=Series_Complete_65PlusPop_Pct,
pop=Census2019,
popgte18=Census2019_18PlusPop,
popgte65=Census2019_65PlusPop
) %>%
mutate(date=lubridate::mdy(date))
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## FIPS = col_character(),
## Recip_County = col_character(),
## Recip_State = col_character(),
## SVI_CTGY = col_character(),
## Metro_status = col_character()
## )
## i Use `spec()` for the full column specifications.
vaxPartialRaw_20220126
## # A tibble: 1,342,224 x 10
## date FIPS countyName state vxcpoppct vxcgte18pct vxcgte65pct pop
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2022-01-25 06031 Kings County CA 47.1 58 74.7 152940
## 2 2022-01-25 08035 Douglas Coun~ CO 71.6 82.4 95 351154
## 3 2022-01-25 24027 Howard County MD 82.9 92.6 95 325690
## 4 2022-01-25 UNK Unknown Coun~ MN 0 0 0 NA
## 5 2022-01-25 37159 Rowan County NC 42.9 51.7 73.7 142088
## 6 2022-01-25 36095 Schoharie Co~ NY 55.5 62 76.1 30999
## 7 2022-01-25 21023 Bracken Coun~ KY 45.8 56 83.7 8303
## 8 2022-01-25 21121 Knox County KY 35.5 43.8 59.2 31145
## 9 2022-01-25 50009 Essex County VT 39.3 43.5 51.9 6163
## 10 2022-01-25 53075 Whitman Coun~ WA 45.4 48.8 82.6 50104
## # ... with 1,342,214 more rows, and 2 more variables: popgte18 <dbl>,
## # popgte65 <dbl>
# Find state records other than 50 states plus DC
vaxPartialRaw_20220126 %>%
count(state) %>%
filter(!(state %in% c(state.abb, "DC")))
## # A tibble: 9 x 2
## state n
## <chr> <int>
## 1 AS 400
## 2 FM 405
## 3 GU 813
## 4 MH 392
## 5 MP 402
## 6 PR 32307
## 7 PW 399
## 8 UNK 308
## 9 VI 1626
# Range of dates include by state
vaxPartialRaw_20220126 %>%
filter(state %in% c(state.abb, "DC")) %>%
count(date, state) %>%
ggplot(aes(x=state, y=date)) +
geom_point(aes(color=n)) +
coord_flip() +
labs(x=NULL, y=NULL, title="Counties reporting by state and date")
# Box plot for vaccine data by state
for (keyVar in c("vxcpoppct", "vxcgte18pct", "vxcgte65pct")) {
p1 <- vaxPartialRaw_20220126 %>%
filter(date==max(date), state %in% c(state.abb, "DC"), FIPS != "UNK") %>%
select(date, state, all_of(keyVar)) %>%
purrr::set_names(c("date", "state", "vax")) %>%
ggplot(aes(x=fct_reorder(state, vax, median, na.rm=TRUE), y=vax)) +
geom_boxplot(fill="lightblue") +
coord_flip() +
labs(x=NULL, y="% fully vaccinated", title=paste0("Fully vaccinated by county as of latest date: ", keyVar))
print(p1)
}
There are some discontinuities in the data, and some states (e.g., Hawaii) do not report at the county-level. State-level totals are compared against county-level totals:
keyDate <- "2021-12-30"
# Get state-level totals
stateTemp <- readFromRDS("cdc_daily_220103")$dfPerCapita %>%
select(state, date, vxc, vxcpm, vxcpoppct, vxcgte18, vxcgte18pct, vxcgte65, vxcgte65pct) %>%
filter(state %in% c(state.abb, "DC"), date==keyDate) %>%
mutate(impPop=100*vxc/vxcpoppct, impPopgte18=100*vxcgte18/vxcgte18pct, impPopgte65=100*vxcgte65/vxcgte65pct)
# Get county-level state totals
countyTemp <- vaxPartialRaw_20220126 %>%
filter(state %in% c(state.abb, "DC"), FIPS != "UNK", date==keyDate) %>%
group_by(state) %>%
summarize(ctypoppct=sum(vxcpoppct*pop)/sum(pop),
ctygte18pct=sum(vxcgte18pct*popgte18)/sum(popgte18),
ctygte65pct=sum(vxcgte65pct*popgte65)/sum(popgte65),
across(starts_with("pop"), sum),
.groups="drop"
)
# Plot of state-level populations
p1 <- stateTemp %>%
ggplot(aes(x=fct_reorder(state, impPop))) +
geom_col(aes(y=impPop/1000000, fill="Under 18")) +
geom_col(aes(y=impPopgte18/1000000, fill="18-64")) +
geom_col(aes(y=impPopgte65/1000000, fill="65+")) +
coord_flip() +
labs(x=NULL,
y="Estimated population (millions)",
title="Implied population by state",
subtitle="Filled bars are from CDC state-level data"
) +
scale_fill_manual("CDC Age",
values=c("Under 18"="green", "18-64"="navy", "65+"="red"),
breaks=c("65+", "18-64", "Under 18")
) +
theme(legend.position="bottom")
# Plot of summed county-level populations
p2 <- countyTemp %>%
ggplot(aes(x=fct_reorder(state, pop))) +
geom_col(aes(y=pop/1000000, fill="Under 18")) +
geom_col(aes(y=popgte18/1000000, fill="18-64")) +
geom_col(aes(y=popgte65/1000000, fill="65+")) +
coord_flip() +
labs(x=NULL,
y="Estimated population (millions)",
title="Implied population by state",
subtitle="Filled bars are from summed county-level data"
) +
scale_fill_manual("Summed County Age",
values=c("Under 18"="green", "18-64"="navy", "65+"="red"),
breaks=c("65+", "18-64", "Under 18")
) +
theme(legend.position="bottom")
gridExtra::grid.arrange(p1, p2, nrow=1)
Percentage differences in population by source are also assessed:
# Integrated data
stateCountyLatest <- stateTemp %>%
full_join(countyTemp, by="state")
# Percentage differences in population
stateCountyLatest %>%
select(state, starts_with("imp"), starts_with("pop")) %>%
pivot_longer(-state) %>%
mutate(src=ifelse(str_detect(name, pattern="^imp"), "vaxcounty", "cdcstate"),
age=case_when(str_detect(name, pattern="gte18$") ~ "gte18",
str_detect(name, pattern="gte65") ~ "gte65",
TRUE ~ "all"
)
) %>%
select(state, age, src, value) %>%
pivot_wider(c("state", "age"), names_from="src", values_from="value") %>%
mutate(rat=vaxcounty/cdcstate) %>%
ggplot(aes(x=fct_reorder(state, rat, .fun=function(x) max(abs(x-1))))) +
geom_point(aes(y=rat)) +
geom_text(aes(y=rat-0.025, label=paste0(round(100*rat, 1), "%")), hjust=1, size=3) +
coord_flip() +
labs(x=NULL, y="Ratio (vax/CDC)", title="Comparison of population by source") +
facet_wrap(~age) +
lims(y=c(0, NA))
Vaccination rates by state as of 30-DEC-2021 are also compared:
# Vaccinated by age group by data source
vaxAgeTemp <- stateCountyLatest %>%
select(state, contains("pct")) %>%
pivot_longer(-state) %>%
mutate(src=ifelse(str_detect(name, pattern="^vxc"), "cdcstate", "vaxcounty"),
age=case_when(str_detect(name, pattern="gte18pct$") ~ "gte18",
str_detect(name, pattern="gte65pct") ~ "gte65",
TRUE ~ "all"
)
) %>%
select(state, age, src, value) %>%
pivot_wider(c("state", "age"), names_from="src", values_from="value") %>%
mutate(delta=vaxcounty-cdcstate)
# Vaccinated by state
vaxAgeTemp %>%
ggplot(aes(x=fct_reorder(state, vaxcounty, .fun=min))) +
geom_point(aes(y=cdcstate, color="State CDC")) +
geom_point(aes(y=vaxcounty, color="Vax Data")) +
coord_flip() +
facet_wrap(~age) +
labs(x=NULL,
y="% fully vaccinated as of 30-DEC-2021",
title="Comparison of fully vaccinated by state and data source"
) +
scale_color_discrete("Source") +
lims(y=c(0, 100))
# Difference in rate
vaxAgeTemp %>%
ggplot(aes(x=fct_reorder(state, delta))) +
geom_point(aes(y=delta)) +
geom_text(aes(y=delta-5, label=round(delta)), hjust=1, size=3) +
coord_flip() +
facet_wrap(~age) +
labs(x=NULL,
y="Difference % fully vaccinated as of 30-DEC-2021\n(Summed County - CDC State)",
title="Comparison of fully vaccinated by state and data source"
) +
geom_hline(yintercept=0, lty=2) +
lims(y=c(-100, NA))
Population data is very similar between the sources, with the exception of slightly more 65+ in northern New England reported in the summed county-level vaccinated data than in the baseline CDC data.
The summed county-level vaccinated data is consistently less than the reference CDC state data, though typically only slightly. Major differences (10%+ for at least one age cohort) are observed in HI (does not report at the county level), GA, VT, VA, MA, and NE.
Vaccinations over time are plotted for an example state:
# Repair the popgte65 column (mostly NA)
vaxPartialRaw_20220126 %>%
filter(state %in% c(state.abb, "DC"), FIPS != "UNK") %>%
is.na() %>%
colMeans()
## date FIPS countyName state vxcpoppct vxcgte18pct
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## vxcgte65pct pop popgte18 popgte65
## 0.0000000 0.0000000 0.0000000 0.9462103
cty65 <- vaxPartialRaw_20220126 %>%
filter(state %in% c(state.abb, "DC"), FIPS != "UNK") %>%
select(state, FIPS, popgte65) %>%
group_by(state, FIPS) %>%
summarize(minpop=min(popgte65, na.rm=TRUE),
maxpop=max(popgte65, na.rm=TRUE),
nnA=sum(is.na(popgte65)),
n=n()
)
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
cty65
## # A tibble: 3,142 x 6
## # Groups: state [51]
## state FIPS minpop maxpop nnA n
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 AK 02013 351 351 387 409
## 2 AK 02016 419 419 387 409
## 3 AK 02020 33757 33757 387 409
## 4 AK 02050 1448 1448 387 409
## 5 AK 02060 136 136 387 409
## 6 AK 02068 238 238 387 409
## 7 AK 02070 494 494 387 409
## 8 AK 02090 10828 10828 387 409
## 9 AK 02100 548 548 387 409
## 10 AK 02105 516 516 387 409
## # ... with 3,132 more rows
cty65 %>% is.na() %>% colSums()
## state FIPS minpop maxpop nnA n
## 0 0 0 0 0 0
cty65 %>% filter(maxpop != minpop)
## # A tibble: 0 x 6
## # Groups: state [0]
## # ... with 6 variables: state <chr>, FIPS <chr>, minpop <dbl>, maxpop <dbl>,
## # nnA <int>, n <int>
vaxFix65_20220126 <- vaxPartialRaw_20220126 %>%
filter(state %in% c(state.abb, "DC"), FIPS != "UNK") %>%
select(-popgte65) %>%
left_join(select(cty65, state, FIPS, popgte65=maxpop), by=c("state", "FIPS"))
vaxFix65_20220126
## # A tibble: 1,285,078 x 10
## date FIPS countyName state vxcpoppct vxcgte18pct vxcgte65pct pop
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2022-01-25 06031 Kings County CA 47.1 58 74.7 152940
## 2 2022-01-25 08035 Douglas Coun~ CO 71.6 82.4 95 351154
## 3 2022-01-25 24027 Howard County MD 82.9 92.6 95 325690
## 4 2022-01-25 37159 Rowan County NC 42.9 51.7 73.7 142088
## 5 2022-01-25 36095 Schoharie Co~ NY 55.5 62 76.1 30999
## 6 2022-01-25 21023 Bracken Coun~ KY 45.8 56 83.7 8303
## 7 2022-01-25 21121 Knox County KY 35.5 43.8 59.2 31145
## 8 2022-01-25 50009 Essex County VT 39.3 43.5 51.9 6163
## 9 2022-01-25 53075 Whitman Coun~ WA 45.4 48.8 82.6 50104
## 10 2022-01-25 48151 Fisher County TX 35.5 43.7 59.4 3830
## # ... with 1,285,068 more rows, and 2 more variables: popgte18 <dbl>,
## # popgte65 <dbl>
vaxState_20220126 <- vaxFix65_20220126 %>%
filter(state %in% c(state.abb, "DC"), FIPS != "UNK") %>%
group_by(date, state) %>%
summarize(ctypoppct=sum(vxcpoppct*pop)/sum(pop),
ctygte18pct=sum(vxcgte18pct*popgte18)/sum(popgte18),
ctygte65pct=sum(vxcgte65pct*popgte65)/sum(popgte65),
across(starts_with("pop"), sum, na.rm=TRUE),
.groups="drop"
)
allState_20220126 <- readFromRDS("cdc_daily_220103")$dfPerCapita %>%
select(state, date, vxcpoppct, vxcgte18pct, vxcgte65pct) %>%
filter(state %in% c(state.abb, "DC")) %>%
right_join(vaxState_20220126, by=c("state", "date"))
allState_20220126
## # A tibble: 20,859 x 11
## state date vxcpoppct vxcgte18pct vxcgte65pct ctypoppct ctygte18pct
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-13 NA NA NA 0 0
## 2 AL 2020-12-13 NA NA NA 0 0
## 3 AR 2020-12-13 NA NA NA 0 0
## 4 AZ 2020-12-13 NA NA NA 0 0
## 5 CA 2020-12-13 NA NA NA 0 0
## 6 CO 2020-12-13 NA NA NA 0 0
## 7 CT 2020-12-13 NA NA NA 0 0
## 8 DC 2020-12-13 NA NA NA 0 0
## 9 DE 2020-12-13 NA NA NA 0 0
## 10 FL 2020-12-13 NA NA NA 0 0
## # ... with 20,849 more rows, and 4 more variables: ctygte65pct <dbl>,
## # pop <dbl>, popgte18 <dbl>, popgte65 <dbl>
allState_20220126 %>%
filter(state=="FL", date >= "2020-12-15") %>%
select(-starts_with("pop")) %>%
pivot_longer(-c(state, date)) %>%
mutate(src=ifelse(str_detect(name, "^cty"), "Summed County", "State"),
age=case_when(str_detect(name, "65pct") ~ "65+",
str_detect(name, "18pct") ~ "18+",
TRUE ~ "All")
) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=src, color=src)) +
facet_wrap(~age) +
labs(x=NULL, y="% Fully Vaccinated", title="Evolution of FL vaccinated data") +
lims(y=c(0, 100)) +
scale_color_discrete("Source")
## Warning: Removed 26 row(s) containing missing values (geom_path).